diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,24965 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.335609430339745, + "eval_steps": 500, + "global_step": 140000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00047700251618827287, + "grad_norm": 0.7297705411911011, + "learning_rate": 0.0001, + "loss": 9.0134, + "num_input_tokens_seen": 26214400, + "step": 50 + }, + { + "epoch": 0.0009540050323765457, + "grad_norm": 0.9636281132698059, + "learning_rate": 0.0002, + "loss": 7.1965, + "num_input_tokens_seen": 52419296, + "step": 100 + }, + { + "epoch": 0.0014310075485648188, + "grad_norm": 0.538168728351593, + "learning_rate": 0.0003, + "loss": 6.2716, + "num_input_tokens_seen": 78626240, + "step": 150 + }, + { + "epoch": 0.0019080100647530915, + "grad_norm": 0.9969781041145325, + "learning_rate": 0.0004, + "loss": 5.6452, + "num_input_tokens_seen": 104833632, + "step": 200 + }, + { + "epoch": 0.0023850125809413646, + "grad_norm": 0.8100546002388, + "learning_rate": 0.0005, + "loss": 5.2224, + "num_input_tokens_seen": 131047584, + "step": 250 + }, + { + "epoch": 0.0028620150971296375, + "grad_norm": 0.4632050693035126, + "learning_rate": 0.0006, + "loss": 4.8733, + "num_input_tokens_seen": 157260544, + "step": 300 + }, + { + "epoch": 0.0033390176133179105, + "grad_norm": 2.2721216678619385, + "learning_rate": 0.0007, + "loss": 4.5928, + "num_input_tokens_seen": 183472896, + "step": 350 + }, + { + "epoch": 0.003816020129506183, + "grad_norm": 0.244660422205925, + "learning_rate": 0.0008, + "loss": 4.4136, + "num_input_tokens_seen": 209685664, + "step": 400 + }, + { + "epoch": 0.004293022645694456, + "grad_norm": 0.3138381540775299, + "learning_rate": 0.0009000000000000001, + "loss": 4.0552, + "num_input_tokens_seen": 235896448, + "step": 450 + }, + { + "epoch": 0.004770025161882729, + "grad_norm": 0.27429279685020447, + "learning_rate": 0.001, + "loss": 3.8584, + "num_input_tokens_seen": 262104320, + "step": 500 + }, + { + "epoch": 0.004770025161882729, + "eval_loss": 3.676422357559204, + "eval_runtime": 80.6607, + "eval_samples_per_second": 61.988, + "eval_steps_per_second": 15.497, + "num_input_tokens_seen": 262104320, + "step": 500 + }, + { + "epoch": 0.005247027678071002, + "grad_norm": 0.20896735787391663, + "learning_rate": 0.001, + "loss": 3.6992, + "num_input_tokens_seen": 288318720, + "step": 550 + }, + { + "epoch": 0.005724030194259275, + "grad_norm": 0.1993994414806366, + "learning_rate": 0.001, + "loss": 3.5839, + "num_input_tokens_seen": 314519904, + "step": 600 + }, + { + "epoch": 0.0062010327104475476, + "grad_norm": 0.21841953694820404, + "learning_rate": 0.001, + "loss": 3.472, + "num_input_tokens_seen": 340725984, + "step": 650 + }, + { + "epoch": 0.006678035226635821, + "grad_norm": 0.17900177836418152, + "learning_rate": 0.001, + "loss": 3.3972, + "num_input_tokens_seen": 366937216, + "step": 700 + }, + { + "epoch": 0.007155037742824093, + "grad_norm": 0.1761142611503601, + "learning_rate": 0.001, + "loss": 3.3383, + "num_input_tokens_seen": 393150592, + "step": 750 + }, + { + "epoch": 0.007632040259012366, + "grad_norm": 0.19628353416919708, + "learning_rate": 0.001, + "loss": 3.2933, + "num_input_tokens_seen": 419364992, + "step": 800 + }, + { + "epoch": 0.008109042775200638, + "grad_norm": 0.18105614185333252, + "learning_rate": 0.001, + "loss": 3.2451, + "num_input_tokens_seen": 445578432, + "step": 850 + }, + { + "epoch": 0.008586045291388912, + "grad_norm": 0.15662750601768494, + "learning_rate": 0.001, + "loss": 3.2089, + "num_input_tokens_seen": 471790144, + "step": 900 + }, + { + "epoch": 0.009063047807577185, + "grad_norm": 0.16136744618415833, + "learning_rate": 0.001, + "loss": 3.1683, + "num_input_tokens_seen": 498004544, + "step": 950 + }, + { + "epoch": 0.009540050323765458, + "grad_norm": 0.16362419724464417, + "learning_rate": 0.001, + "loss": 3.1457, + "num_input_tokens_seen": 524218944, + "step": 1000 + }, + { + "epoch": 0.009540050323765458, + "eval_loss": 3.0411295890808105, + "eval_runtime": 80.4796, + "eval_samples_per_second": 62.128, + "eval_steps_per_second": 15.532, + "num_input_tokens_seen": 524218944, + "step": 1000 + }, + { + "epoch": 0.01001705283995373, + "grad_norm": 0.15351758897304535, + "learning_rate": 0.001, + "loss": 3.1183, + "num_input_tokens_seen": 550433344, + "step": 1050 + }, + { + "epoch": 0.010494055356142003, + "grad_norm": 0.1585114449262619, + "learning_rate": 0.001, + "loss": 3.085, + "num_input_tokens_seen": 576647744, + "step": 1100 + }, + { + "epoch": 0.010971057872330277, + "grad_norm": 0.14503733813762665, + "learning_rate": 0.001, + "loss": 3.0629, + "num_input_tokens_seen": 602861312, + "step": 1150 + }, + { + "epoch": 0.01144806038851855, + "grad_norm": 0.14188329875469208, + "learning_rate": 0.001, + "loss": 3.0349, + "num_input_tokens_seen": 629073184, + "step": 1200 + }, + { + "epoch": 0.011925062904706822, + "grad_norm": 0.14688891172409058, + "learning_rate": 0.001, + "loss": 3.0177, + "num_input_tokens_seen": 655280128, + "step": 1250 + }, + { + "epoch": 0.012402065420895095, + "grad_norm": 0.1432763636112213, + "learning_rate": 0.001, + "loss": 3.0108, + "num_input_tokens_seen": 681494048, + "step": 1300 + }, + { + "epoch": 0.012879067937083368, + "grad_norm": 0.13625910878181458, + "learning_rate": 0.001, + "loss": 2.9809, + "num_input_tokens_seen": 707703840, + "step": 1350 + }, + { + "epoch": 0.013356070453271642, + "grad_norm": 0.13593867421150208, + "learning_rate": 0.001, + "loss": 2.9671, + "num_input_tokens_seen": 733906080, + "step": 1400 + }, + { + "epoch": 0.013833072969459913, + "grad_norm": 0.1373436599969864, + "learning_rate": 0.001, + "loss": 2.9487, + "num_input_tokens_seen": 760116640, + "step": 1450 + }, + { + "epoch": 0.014310075485648187, + "grad_norm": 0.13173167407512665, + "learning_rate": 0.001, + "loss": 2.9472, + "num_input_tokens_seen": 786331040, + "step": 1500 + }, + { + "epoch": 0.014310075485648187, + "eval_loss": 2.8491108417510986, + "eval_runtime": 81.5351, + "eval_samples_per_second": 61.323, + "eval_steps_per_second": 15.331, + "num_input_tokens_seen": 786331040, + "step": 1500 + }, + { + "epoch": 0.01478707800183646, + "grad_norm": 0.12925633788108826, + "learning_rate": 0.001, + "loss": 2.9319, + "num_input_tokens_seen": 812544672, + "step": 1550 + }, + { + "epoch": 0.015264080518024732, + "grad_norm": 0.1303853690624237, + "learning_rate": 0.001, + "loss": 2.9005, + "num_input_tokens_seen": 838745248, + "step": 1600 + }, + { + "epoch": 0.015741083034213007, + "grad_norm": 0.13408446311950684, + "learning_rate": 0.001, + "loss": 2.9056, + "num_input_tokens_seen": 864955200, + "step": 1650 + }, + { + "epoch": 0.016218085550401277, + "grad_norm": 0.12651245296001434, + "learning_rate": 0.001, + "loss": 2.8906, + "num_input_tokens_seen": 891169600, + "step": 1700 + }, + { + "epoch": 0.01669508806658955, + "grad_norm": 0.12796996533870697, + "learning_rate": 0.001, + "loss": 2.867, + "num_input_tokens_seen": 917366784, + "step": 1750 + }, + { + "epoch": 0.017172090582777823, + "grad_norm": 0.12910747528076172, + "learning_rate": 0.001, + "loss": 2.8778, + "num_input_tokens_seen": 943571520, + "step": 1800 + }, + { + "epoch": 0.017649093098966097, + "grad_norm": 0.1252501755952835, + "learning_rate": 0.001, + "loss": 2.862, + "num_input_tokens_seen": 969782240, + "step": 1850 + }, + { + "epoch": 0.01812609561515437, + "grad_norm": 0.12633615732192993, + "learning_rate": 0.001, + "loss": 2.8504, + "num_input_tokens_seen": 995995232, + "step": 1900 + }, + { + "epoch": 0.018603098131342644, + "grad_norm": 0.13014598190784454, + "learning_rate": 0.001, + "loss": 2.8469, + "num_input_tokens_seen": 1022209632, + "step": 1950 + }, + { + "epoch": 0.019080100647530917, + "grad_norm": 0.12546297907829285, + "learning_rate": 0.001, + "loss": 2.8357, + "num_input_tokens_seen": 1048424032, + "step": 2000 + }, + { + "epoch": 0.019080100647530917, + "eval_loss": 2.7469162940979004, + "eval_runtime": 81.156, + "eval_samples_per_second": 61.61, + "eval_steps_per_second": 15.402, + "num_input_tokens_seen": 1048424032, + "step": 2000 + }, + { + "epoch": 0.01955710316371919, + "grad_norm": 0.12093319743871689, + "learning_rate": 0.001, + "loss": 2.8284, + "num_input_tokens_seen": 1074632192, + "step": 2050 + }, + { + "epoch": 0.02003410567990746, + "grad_norm": 0.12063843011856079, + "learning_rate": 0.001, + "loss": 2.8207, + "num_input_tokens_seen": 1100838272, + "step": 2100 + }, + { + "epoch": 0.020511108196095734, + "grad_norm": 0.12201642990112305, + "learning_rate": 0.001, + "loss": 2.82, + "num_input_tokens_seen": 1127052672, + "step": 2150 + }, + { + "epoch": 0.020988110712284007, + "grad_norm": 0.1250978708267212, + "learning_rate": 0.001, + "loss": 2.7882, + "num_input_tokens_seen": 1153256992, + "step": 2200 + }, + { + "epoch": 0.02146511322847228, + "grad_norm": 0.12314685434103012, + "learning_rate": 0.001, + "loss": 2.7836, + "num_input_tokens_seen": 1179454080, + "step": 2250 + }, + { + "epoch": 0.021942115744660554, + "grad_norm": 0.12332645803689957, + "learning_rate": 0.001, + "loss": 2.7729, + "num_input_tokens_seen": 1205657440, + "step": 2300 + }, + { + "epoch": 0.022419118260848827, + "grad_norm": 0.12105974555015564, + "learning_rate": 0.001, + "loss": 2.7852, + "num_input_tokens_seen": 1231864768, + "step": 2350 + }, + { + "epoch": 0.0228961207770371, + "grad_norm": 0.1196957528591156, + "learning_rate": 0.001, + "loss": 2.7846, + "num_input_tokens_seen": 1258076896, + "step": 2400 + }, + { + "epoch": 0.02337312329322537, + "grad_norm": 0.11613446474075317, + "learning_rate": 0.001, + "loss": 2.7633, + "num_input_tokens_seen": 1284287264, + "step": 2450 + }, + { + "epoch": 0.023850125809413644, + "grad_norm": 0.11567731201648712, + "learning_rate": 0.001, + "loss": 2.7564, + "num_input_tokens_seen": 1310500000, + "step": 2500 + }, + { + "epoch": 0.023850125809413644, + "eval_loss": 2.682379722595215, + "eval_runtime": 81.2078, + "eval_samples_per_second": 61.57, + "eval_steps_per_second": 15.393, + "num_input_tokens_seen": 1310500000, + "step": 2500 + }, + { + "epoch": 0.024327128325601917, + "grad_norm": 0.11191745847463608, + "learning_rate": 0.001, + "loss": 2.763, + "num_input_tokens_seen": 1336707360, + "step": 2550 + }, + { + "epoch": 0.02480413084179019, + "grad_norm": 0.12129329890012741, + "learning_rate": 0.001, + "loss": 2.7546, + "num_input_tokens_seen": 1362918912, + "step": 2600 + }, + { + "epoch": 0.025281133357978464, + "grad_norm": 0.11706431955099106, + "learning_rate": 0.001, + "loss": 2.7437, + "num_input_tokens_seen": 1389127904, + "step": 2650 + }, + { + "epoch": 0.025758135874166737, + "grad_norm": 0.11681405454874039, + "learning_rate": 0.001, + "loss": 2.7439, + "num_input_tokens_seen": 1415333568, + "step": 2700 + }, + { + "epoch": 0.02623513839035501, + "grad_norm": 0.11216867715120316, + "learning_rate": 0.001, + "loss": 2.7292, + "num_input_tokens_seen": 1441547968, + "step": 2750 + }, + { + "epoch": 0.026712140906543284, + "grad_norm": 0.10973285883665085, + "learning_rate": 0.001, + "loss": 2.7223, + "num_input_tokens_seen": 1467757216, + "step": 2800 + }, + { + "epoch": 0.027189143422731554, + "grad_norm": 0.11623947322368622, + "learning_rate": 0.001, + "loss": 2.729, + "num_input_tokens_seen": 1493966976, + "step": 2850 + }, + { + "epoch": 0.027666145938919827, + "grad_norm": 0.11490777134895325, + "learning_rate": 0.001, + "loss": 2.7275, + "num_input_tokens_seen": 1520179488, + "step": 2900 + }, + { + "epoch": 0.0281431484551081, + "grad_norm": 0.11005893349647522, + "learning_rate": 0.001, + "loss": 2.7151, + "num_input_tokens_seen": 1546390432, + "step": 2950 + }, + { + "epoch": 0.028620150971296374, + "grad_norm": 0.11037708073854446, + "learning_rate": 0.001, + "loss": 2.719, + "num_input_tokens_seen": 1572599776, + "step": 3000 + }, + { + "epoch": 0.028620150971296374, + "eval_loss": 2.629970073699951, + "eval_runtime": 79.9671, + "eval_samples_per_second": 62.526, + "eval_steps_per_second": 15.631, + "num_input_tokens_seen": 1572599776, + "step": 3000 + }, + { + "epoch": 0.029097153487484647, + "grad_norm": 0.11220329999923706, + "learning_rate": 0.001, + "loss": 2.7135, + "num_input_tokens_seen": 1598814176, + "step": 3050 + }, + { + "epoch": 0.02957415600367292, + "grad_norm": 0.1040477603673935, + "learning_rate": 0.001, + "loss": 2.7088, + "num_input_tokens_seen": 1625028576, + "step": 3100 + }, + { + "epoch": 0.030051158519861194, + "grad_norm": 0.11202716827392578, + "learning_rate": 0.001, + "loss": 2.6951, + "num_input_tokens_seen": 1651241664, + "step": 3150 + }, + { + "epoch": 0.030528161036049464, + "grad_norm": 0.11337998509407043, + "learning_rate": 0.001, + "loss": 2.6895, + "num_input_tokens_seen": 1677453952, + "step": 3200 + }, + { + "epoch": 0.031005163552237737, + "grad_norm": 0.10991177707910538, + "learning_rate": 0.001, + "loss": 2.6833, + "num_input_tokens_seen": 1703668064, + "step": 3250 + }, + { + "epoch": 0.031482166068426014, + "grad_norm": 0.11068691313266754, + "learning_rate": 0.001, + "loss": 2.6861, + "num_input_tokens_seen": 1729882464, + "step": 3300 + }, + { + "epoch": 0.031959168584614284, + "grad_norm": 0.10345873981714249, + "learning_rate": 0.001, + "loss": 2.6793, + "num_input_tokens_seen": 1756093888, + "step": 3350 + }, + { + "epoch": 0.032436171100802554, + "grad_norm": 0.10937945544719696, + "learning_rate": 0.001, + "loss": 2.6793, + "num_input_tokens_seen": 1782308288, + "step": 3400 + }, + { + "epoch": 0.03291317361699083, + "grad_norm": 0.10656026005744934, + "learning_rate": 0.001, + "loss": 2.6773, + "num_input_tokens_seen": 1808520352, + "step": 3450 + }, + { + "epoch": 0.0333901761331791, + "grad_norm": 0.10830007493495941, + "learning_rate": 0.001, + "loss": 2.664, + "num_input_tokens_seen": 1834732864, + "step": 3500 + }, + { + "epoch": 0.0333901761331791, + "eval_loss": 2.590848684310913, + "eval_runtime": 80.8416, + "eval_samples_per_second": 61.849, + "eval_steps_per_second": 15.462, + "num_input_tokens_seen": 1834732864, + "step": 3500 + }, + { + "epoch": 0.03386717864936738, + "grad_norm": 0.10620469599962234, + "learning_rate": 0.001, + "loss": 2.6712, + "num_input_tokens_seen": 1860937952, + "step": 3550 + }, + { + "epoch": 0.03434418116555565, + "grad_norm": 0.10916534811258316, + "learning_rate": 0.001, + "loss": 2.6589, + "num_input_tokens_seen": 1887151168, + "step": 3600 + }, + { + "epoch": 0.034821183681743924, + "grad_norm": 0.1078685000538826, + "learning_rate": 0.001, + "loss": 2.6528, + "num_input_tokens_seen": 1913358112, + "step": 3650 + }, + { + "epoch": 0.035298186197932194, + "grad_norm": 0.10825319588184357, + "learning_rate": 0.001, + "loss": 2.6542, + "num_input_tokens_seen": 1939568448, + "step": 3700 + }, + { + "epoch": 0.035775188714120464, + "grad_norm": 0.10253206640481949, + "learning_rate": 0.001, + "loss": 2.6568, + "num_input_tokens_seen": 1965778912, + "step": 3750 + }, + { + "epoch": 0.03625219123030874, + "grad_norm": 0.10500983893871307, + "learning_rate": 0.001, + "loss": 2.6474, + "num_input_tokens_seen": 1991992192, + "step": 3800 + }, + { + "epoch": 0.03672919374649701, + "grad_norm": 0.11013150215148926, + "learning_rate": 0.001, + "loss": 2.6529, + "num_input_tokens_seen": 2018206592, + "step": 3850 + }, + { + "epoch": 0.03720619626268529, + "grad_norm": 0.10353852063417435, + "learning_rate": 0.001, + "loss": 2.6254, + "num_input_tokens_seen": 2044403936, + "step": 3900 + }, + { + "epoch": 0.03768319877887356, + "grad_norm": 0.11490489542484283, + "learning_rate": 0.001, + "loss": 2.639, + "num_input_tokens_seen": 2070618336, + "step": 3950 + }, + { + "epoch": 0.038160201295061834, + "grad_norm": 0.10220393538475037, + "learning_rate": 0.001, + "loss": 2.6351, + "num_input_tokens_seen": 2096832320, + "step": 4000 + }, + { + "epoch": 0.038160201295061834, + "eval_loss": 2.5540711879730225, + "eval_runtime": 80.3545, + "eval_samples_per_second": 62.224, + "eval_steps_per_second": 15.556, + "num_input_tokens_seen": 2096832320, + "step": 4000 + }, + { + "epoch": 0.038637203811250104, + "grad_norm": 0.10771480202674866, + "learning_rate": 0.001, + "loss": 2.6332, + "num_input_tokens_seen": 2123043360, + "step": 4050 + }, + { + "epoch": 0.03911420632743838, + "grad_norm": 0.10670652985572815, + "learning_rate": 0.001, + "loss": 2.6188, + "num_input_tokens_seen": 2149257312, + "step": 4100 + }, + { + "epoch": 0.03959120884362665, + "grad_norm": 0.10759977996349335, + "learning_rate": 0.001, + "loss": 2.6244, + "num_input_tokens_seen": 2175471424, + "step": 4150 + }, + { + "epoch": 0.04006821135981492, + "grad_norm": 0.10374791920185089, + "learning_rate": 0.001, + "loss": 2.6311, + "num_input_tokens_seen": 2201682208, + "step": 4200 + }, + { + "epoch": 0.0405452138760032, + "grad_norm": 0.11101187020540237, + "learning_rate": 0.001, + "loss": 2.6163, + "num_input_tokens_seen": 2227895840, + "step": 4250 + }, + { + "epoch": 0.04102221639219147, + "grad_norm": 0.11088625341653824, + "learning_rate": 0.001, + "loss": 2.6197, + "num_input_tokens_seen": 2254107488, + "step": 4300 + }, + { + "epoch": 0.041499218908379744, + "grad_norm": 0.10880734026432037, + "learning_rate": 0.001, + "loss": 2.6176, + "num_input_tokens_seen": 2280321504, + "step": 4350 + }, + { + "epoch": 0.041976221424568014, + "grad_norm": 0.10170961171388626, + "learning_rate": 0.001, + "loss": 2.6257, + "num_input_tokens_seen": 2306535904, + "step": 4400 + }, + { + "epoch": 0.04245322394075629, + "grad_norm": 0.11956244707107544, + "learning_rate": 0.001, + "loss": 2.5995, + "num_input_tokens_seen": 2332750304, + "step": 4450 + }, + { + "epoch": 0.04293022645694456, + "grad_norm": 0.11314979940652847, + "learning_rate": 0.001, + "loss": 2.5897, + "num_input_tokens_seen": 2358960576, + "step": 4500 + }, + { + "epoch": 0.04293022645694456, + "eval_loss": 2.518724203109741, + "eval_runtime": 81.1978, + "eval_samples_per_second": 61.578, + "eval_steps_per_second": 15.395, + "num_input_tokens_seen": 2358960576, + "step": 4500 + }, + { + "epoch": 0.04340722897313283, + "grad_norm": 0.1124994158744812, + "learning_rate": 0.001, + "loss": 2.5951, + "num_input_tokens_seen": 2385174976, + "step": 4550 + }, + { + "epoch": 0.04388423148932111, + "grad_norm": 0.10833606123924255, + "learning_rate": 0.001, + "loss": 2.5963, + "num_input_tokens_seen": 2411389376, + "step": 4600 + }, + { + "epoch": 0.04436123400550938, + "grad_norm": 0.10818412154912949, + "learning_rate": 0.001, + "loss": 2.5792, + "num_input_tokens_seen": 2437602528, + "step": 4650 + }, + { + "epoch": 0.044838236521697654, + "grad_norm": 0.11943142861127853, + "learning_rate": 0.001, + "loss": 2.5782, + "num_input_tokens_seen": 2463816672, + "step": 4700 + }, + { + "epoch": 0.045315239037885924, + "grad_norm": 0.11240798234939575, + "learning_rate": 0.001, + "loss": 2.5745, + "num_input_tokens_seen": 2490026592, + "step": 4750 + }, + { + "epoch": 0.0457922415540742, + "grad_norm": 0.11156616359949112, + "learning_rate": 0.001, + "loss": 2.5825, + "num_input_tokens_seen": 2516240992, + "step": 4800 + }, + { + "epoch": 0.04626924407026247, + "grad_norm": 0.121095672249794, + "learning_rate": 0.001, + "loss": 2.5813, + "num_input_tokens_seen": 2542455392, + "step": 4850 + }, + { + "epoch": 0.04674624658645074, + "grad_norm": 0.11107343435287476, + "learning_rate": 0.001, + "loss": 2.5752, + "num_input_tokens_seen": 2568666624, + "step": 4900 + }, + { + "epoch": 0.04722324910263902, + "grad_norm": 0.10824497044086456, + "learning_rate": 0.001, + "loss": 2.5692, + "num_input_tokens_seen": 2594875456, + "step": 4950 + }, + { + "epoch": 0.04770025161882729, + "grad_norm": 0.11280784755945206, + "learning_rate": 0.001, + "loss": 2.5672, + "num_input_tokens_seen": 2621089856, + "step": 5000 + }, + { + "epoch": 0.04770025161882729, + "eval_loss": 2.490786552429199, + "eval_runtime": 80.5954, + "eval_samples_per_second": 62.038, + "eval_steps_per_second": 15.51, + "num_input_tokens_seen": 2621089856, + "step": 5000 + }, + { + "epoch": 0.048177254135015564, + "grad_norm": 0.11431359499692917, + "learning_rate": 0.001, + "loss": 2.569, + "num_input_tokens_seen": 2647300640, + "step": 5050 + }, + { + "epoch": 0.048654256651203834, + "grad_norm": 0.12080084532499313, + "learning_rate": 0.001, + "loss": 2.5642, + "num_input_tokens_seen": 2673510592, + "step": 5100 + }, + { + "epoch": 0.04913125916739211, + "grad_norm": 0.11316218972206116, + "learning_rate": 0.001, + "loss": 2.5716, + "num_input_tokens_seen": 2699713920, + "step": 5150 + }, + { + "epoch": 0.04960826168358038, + "grad_norm": 0.1254076361656189, + "learning_rate": 0.001, + "loss": 2.5547, + "num_input_tokens_seen": 2725913312, + "step": 5200 + }, + { + "epoch": 0.05008526419976865, + "grad_norm": 0.11621085554361343, + "learning_rate": 0.001, + "loss": 2.5575, + "num_input_tokens_seen": 2752123328, + "step": 5250 + }, + { + "epoch": 0.05056226671595693, + "grad_norm": 0.1208173856139183, + "learning_rate": 0.001, + "loss": 2.5484, + "num_input_tokens_seen": 2778334848, + "step": 5300 + }, + { + "epoch": 0.0510392692321452, + "grad_norm": 0.11889180541038513, + "learning_rate": 0.001, + "loss": 2.5501, + "num_input_tokens_seen": 2804545664, + "step": 5350 + }, + { + "epoch": 0.051516271748333474, + "grad_norm": 0.11486896872520447, + "learning_rate": 0.001, + "loss": 2.5472, + "num_input_tokens_seen": 2830748096, + "step": 5400 + }, + { + "epoch": 0.051993274264521744, + "grad_norm": 0.11431973427534103, + "learning_rate": 0.001, + "loss": 2.5494, + "num_input_tokens_seen": 2856962496, + "step": 5450 + }, + { + "epoch": 0.05247027678071002, + "grad_norm": 0.11589290201663971, + "learning_rate": 0.001, + "loss": 2.5412, + "num_input_tokens_seen": 2883166048, + "step": 5500 + }, + { + "epoch": 0.05247027678071002, + "eval_loss": 2.4638915061950684, + "eval_runtime": 80.0714, + "eval_samples_per_second": 62.444, + "eval_steps_per_second": 15.611, + "num_input_tokens_seen": 2883166048, + "step": 5500 + }, + { + "epoch": 0.05294727929689829, + "grad_norm": 0.11737602949142456, + "learning_rate": 0.001, + "loss": 2.5458, + "num_input_tokens_seen": 2909379680, + "step": 5550 + }, + { + "epoch": 0.05342428181308657, + "grad_norm": 0.11384102702140808, + "learning_rate": 0.001, + "loss": 2.5521, + "num_input_tokens_seen": 2935594080, + "step": 5600 + }, + { + "epoch": 0.05390128432927484, + "grad_norm": 0.12825793027877808, + "learning_rate": 0.001, + "loss": 2.5449, + "num_input_tokens_seen": 2961804896, + "step": 5650 + }, + { + "epoch": 0.05437828684546311, + "grad_norm": 0.11516230553388596, + "learning_rate": 0.001, + "loss": 2.5306, + "num_input_tokens_seen": 2988008128, + "step": 5700 + }, + { + "epoch": 0.054855289361651384, + "grad_norm": 0.11697406321763992, + "learning_rate": 0.001, + "loss": 2.5265, + "num_input_tokens_seen": 3014213824, + "step": 5750 + }, + { + "epoch": 0.055332291877839654, + "grad_norm": 0.1262071430683136, + "learning_rate": 0.001, + "loss": 2.5359, + "num_input_tokens_seen": 3040422720, + "step": 5800 + }, + { + "epoch": 0.05580929439402793, + "grad_norm": 0.11729097366333008, + "learning_rate": 0.001, + "loss": 2.5339, + "num_input_tokens_seen": 3066632448, + "step": 5850 + }, + { + "epoch": 0.0562862969102162, + "grad_norm": 0.12072544544935226, + "learning_rate": 0.001, + "loss": 2.5184, + "num_input_tokens_seen": 3092846848, + "step": 5900 + }, + { + "epoch": 0.05676329942640448, + "grad_norm": 0.12556667625904083, + "learning_rate": 0.001, + "loss": 2.5229, + "num_input_tokens_seen": 3119043104, + "step": 5950 + }, + { + "epoch": 0.05724030194259275, + "grad_norm": 0.13290442526340485, + "learning_rate": 0.001, + "loss": 2.5194, + "num_input_tokens_seen": 3145255744, + "step": 6000 + }, + { + "epoch": 0.05724030194259275, + "eval_loss": 2.442291736602783, + "eval_runtime": 80.47, + "eval_samples_per_second": 62.135, + "eval_steps_per_second": 15.534, + "num_input_tokens_seen": 3145255744, + "step": 6000 + }, + { + "epoch": 0.05771730445878102, + "grad_norm": 0.1415167599916458, + "learning_rate": 0.001, + "loss": 2.5221, + "num_input_tokens_seen": 3171470144, + "step": 6050 + }, + { + "epoch": 0.058194306974969294, + "grad_norm": 0.11889927089214325, + "learning_rate": 0.001, + "loss": 2.5192, + "num_input_tokens_seen": 3197676704, + "step": 6100 + }, + { + "epoch": 0.058671309491157564, + "grad_norm": 0.12153992801904678, + "learning_rate": 0.001, + "loss": 2.5166, + "num_input_tokens_seen": 3223884160, + "step": 6150 + }, + { + "epoch": 0.05914831200734584, + "grad_norm": 0.11614126712083817, + "learning_rate": 0.001, + "loss": 2.5287, + "num_input_tokens_seen": 3250096704, + "step": 6200 + }, + { + "epoch": 0.05962531452353411, + "grad_norm": 0.1198962926864624, + "learning_rate": 0.001, + "loss": 2.5111, + "num_input_tokens_seen": 3276311040, + "step": 6250 + }, + { + "epoch": 0.06010231703972239, + "grad_norm": 0.13005641102790833, + "learning_rate": 0.001, + "loss": 2.509, + "num_input_tokens_seen": 3302517568, + "step": 6300 + }, + { + "epoch": 0.06057931955591066, + "grad_norm": 0.11713956296443939, + "learning_rate": 0.001, + "loss": 2.5089, + "num_input_tokens_seen": 3328719296, + "step": 6350 + }, + { + "epoch": 0.06105632207209893, + "grad_norm": 0.11161922663450241, + "learning_rate": 0.001, + "loss": 2.5082, + "num_input_tokens_seen": 3354930368, + "step": 6400 + }, + { + "epoch": 0.061533324588287204, + "grad_norm": 0.12296202778816223, + "learning_rate": 0.001, + "loss": 2.5102, + "num_input_tokens_seen": 3381142240, + "step": 6450 + }, + { + "epoch": 0.062010327104475474, + "grad_norm": 0.11225474625825882, + "learning_rate": 0.001, + "loss": 2.5105, + "num_input_tokens_seen": 3407356352, + "step": 6500 + }, + { + "epoch": 0.062010327104475474, + "eval_loss": 2.4205968379974365, + "eval_runtime": 80.5402, + "eval_samples_per_second": 62.081, + "eval_steps_per_second": 15.52, + "num_input_tokens_seen": 3407356352, + "step": 6500 + }, + { + "epoch": 0.06248732962066375, + "grad_norm": 0.12702156603336334, + "learning_rate": 0.001, + "loss": 2.5075, + "num_input_tokens_seen": 3433569248, + "step": 6550 + }, + { + "epoch": 0.06296433213685203, + "grad_norm": 0.12885423004627228, + "learning_rate": 0.001, + "loss": 2.4859, + "num_input_tokens_seen": 3459777248, + "step": 6600 + }, + { + "epoch": 0.06344133465304029, + "grad_norm": 0.13935446739196777, + "learning_rate": 0.001, + "loss": 2.5114, + "num_input_tokens_seen": 3485989376, + "step": 6650 + }, + { + "epoch": 0.06391833716922857, + "grad_norm": 0.12149051576852798, + "learning_rate": 0.001, + "loss": 2.4924, + "num_input_tokens_seen": 3512203776, + "step": 6700 + }, + { + "epoch": 0.06439533968541684, + "grad_norm": 0.12380675226449966, + "learning_rate": 0.001, + "loss": 2.4963, + "num_input_tokens_seen": 3538413504, + "step": 6750 + }, + { + "epoch": 0.06487234220160511, + "grad_norm": 0.12020547688007355, + "learning_rate": 0.001, + "loss": 2.4924, + "num_input_tokens_seen": 3564626560, + "step": 6800 + }, + { + "epoch": 0.06534934471779338, + "grad_norm": 0.12433449178934097, + "learning_rate": 0.001, + "loss": 2.4835, + "num_input_tokens_seen": 3590833408, + "step": 6850 + }, + { + "epoch": 0.06582634723398166, + "grad_norm": 0.11073850840330124, + "learning_rate": 0.001, + "loss": 2.49, + "num_input_tokens_seen": 3617045664, + "step": 6900 + }, + { + "epoch": 0.06630334975016994, + "grad_norm": 0.12657274305820465, + "learning_rate": 0.001, + "loss": 2.4922, + "num_input_tokens_seen": 3643256224, + "step": 6950 + }, + { + "epoch": 0.0667803522663582, + "grad_norm": 0.13630461692810059, + "learning_rate": 0.001, + "loss": 2.4816, + "num_input_tokens_seen": 3669468320, + "step": 7000 + }, + { + "epoch": 0.0667803522663582, + "eval_loss": 2.4051008224487305, + "eval_runtime": 80.911, + "eval_samples_per_second": 61.796, + "eval_steps_per_second": 15.449, + "num_input_tokens_seen": 3669468320, + "step": 7000 + }, + { + "epoch": 0.06725735478254648, + "grad_norm": 0.11266546696424484, + "learning_rate": 0.001, + "loss": 2.4822, + "num_input_tokens_seen": 3695674400, + "step": 7050 + }, + { + "epoch": 0.06773435729873475, + "grad_norm": 0.13039050996303558, + "learning_rate": 0.001, + "loss": 2.4885, + "num_input_tokens_seen": 3721882592, + "step": 7100 + }, + { + "epoch": 0.06821135981492302, + "grad_norm": 0.11898328363895416, + "learning_rate": 0.001, + "loss": 2.4756, + "num_input_tokens_seen": 3748091968, + "step": 7150 + }, + { + "epoch": 0.0686883623311113, + "grad_norm": 0.11951896548271179, + "learning_rate": 0.001, + "loss": 2.4719, + "num_input_tokens_seen": 3774297216, + "step": 7200 + }, + { + "epoch": 0.06916536484729957, + "grad_norm": 0.13969680666923523, + "learning_rate": 0.001, + "loss": 2.4706, + "num_input_tokens_seen": 3800509600, + "step": 7250 + }, + { + "epoch": 0.06964236736348785, + "grad_norm": 0.12787151336669922, + "learning_rate": 0.001, + "loss": 2.4738, + "num_input_tokens_seen": 3826723648, + "step": 7300 + }, + { + "epoch": 0.07011936987967611, + "grad_norm": 0.13117018342018127, + "learning_rate": 0.001, + "loss": 2.4735, + "num_input_tokens_seen": 3852920288, + "step": 7350 + }, + { + "epoch": 0.07059637239586439, + "grad_norm": 0.11509765684604645, + "learning_rate": 0.001, + "loss": 2.4648, + "num_input_tokens_seen": 3879127072, + "step": 7400 + }, + { + "epoch": 0.07107337491205266, + "grad_norm": 0.1272098869085312, + "learning_rate": 0.001, + "loss": 2.4806, + "num_input_tokens_seen": 3905334720, + "step": 7450 + }, + { + "epoch": 0.07155037742824093, + "grad_norm": 0.1205294206738472, + "learning_rate": 0.001, + "loss": 2.4688, + "num_input_tokens_seen": 3931543616, + "step": 7500 + }, + { + "epoch": 0.07155037742824093, + "eval_loss": 2.3871288299560547, + "eval_runtime": 81.0356, + "eval_samples_per_second": 61.701, + "eval_steps_per_second": 15.425, + "num_input_tokens_seen": 3931543616, + "step": 7500 + }, + { + "epoch": 0.0720273799444292, + "grad_norm": 0.13648000359535217, + "learning_rate": 0.001, + "loss": 2.4692, + "num_input_tokens_seen": 3957757216, + "step": 7550 + }, + { + "epoch": 0.07250438246061748, + "grad_norm": 0.13873665034770966, + "learning_rate": 0.001, + "loss": 2.4642, + "num_input_tokens_seen": 3983965696, + "step": 7600 + }, + { + "epoch": 0.07298138497680576, + "grad_norm": 0.1256738156080246, + "learning_rate": 0.001, + "loss": 2.4706, + "num_input_tokens_seen": 4010175648, + "step": 7650 + }, + { + "epoch": 0.07345838749299402, + "grad_norm": 0.12166794389486313, + "learning_rate": 0.001, + "loss": 2.4663, + "num_input_tokens_seen": 4036387936, + "step": 7700 + }, + { + "epoch": 0.0739353900091823, + "grad_norm": 0.1347389817237854, + "learning_rate": 0.001, + "loss": 2.4678, + "num_input_tokens_seen": 4062595136, + "step": 7750 + }, + { + "epoch": 0.07441239252537057, + "grad_norm": 0.13025853037834167, + "learning_rate": 0.001, + "loss": 2.4668, + "num_input_tokens_seen": 4088807456, + "step": 7800 + }, + { + "epoch": 0.07488939504155885, + "grad_norm": 0.12036091089248657, + "learning_rate": 0.001, + "loss": 2.4665, + "num_input_tokens_seen": 4115018176, + "step": 7850 + }, + { + "epoch": 0.07536639755774711, + "grad_norm": 0.12124933302402496, + "learning_rate": 0.001, + "loss": 2.4564, + "num_input_tokens_seen": 4141222464, + "step": 7900 + }, + { + "epoch": 0.07584340007393539, + "grad_norm": 0.1202184334397316, + "learning_rate": 0.001, + "loss": 2.4657, + "num_input_tokens_seen": 4167436512, + "step": 7950 + }, + { + "epoch": 0.07632040259012367, + "grad_norm": 0.14438344538211823, + "learning_rate": 0.001, + "loss": 2.4446, + "num_input_tokens_seen": 4193649312, + "step": 8000 + }, + { + "epoch": 0.07632040259012367, + "eval_loss": 2.3720171451568604, + "eval_runtime": 80.9787, + "eval_samples_per_second": 61.745, + "eval_steps_per_second": 15.436, + "num_input_tokens_seen": 4193649312, + "step": 8000 + }, + { + "epoch": 0.07679740510631193, + "grad_norm": 0.13343645632266998, + "learning_rate": 0.001, + "loss": 2.44, + "num_input_tokens_seen": 4219850656, + "step": 8050 + }, + { + "epoch": 0.07727440762250021, + "grad_norm": 0.13672953844070435, + "learning_rate": 0.001, + "loss": 2.4528, + "num_input_tokens_seen": 4246062080, + "step": 8100 + }, + { + "epoch": 0.07775141013868848, + "grad_norm": 0.12469538301229477, + "learning_rate": 0.001, + "loss": 2.4564, + "num_input_tokens_seen": 4272276480, + "step": 8150 + }, + { + "epoch": 0.07822841265487676, + "grad_norm": 0.1281704306602478, + "learning_rate": 0.001, + "loss": 2.4448, + "num_input_tokens_seen": 4298480576, + "step": 8200 + }, + { + "epoch": 0.07870541517106502, + "grad_norm": 0.12879879772663116, + "learning_rate": 0.001, + "loss": 2.4482, + "num_input_tokens_seen": 4324682816, + "step": 8250 + }, + { + "epoch": 0.0791824176872533, + "grad_norm": 0.11960000544786453, + "learning_rate": 0.001, + "loss": 2.4418, + "num_input_tokens_seen": 4350897216, + "step": 8300 + }, + { + "epoch": 0.07965942020344158, + "grad_norm": 0.13047458231449127, + "learning_rate": 0.001, + "loss": 2.4595, + "num_input_tokens_seen": 4377111616, + "step": 8350 + }, + { + "epoch": 0.08013642271962984, + "grad_norm": 0.12718771398067474, + "learning_rate": 0.001, + "loss": 2.4419, + "num_input_tokens_seen": 4403326016, + "step": 8400 + }, + { + "epoch": 0.08061342523581812, + "grad_norm": 0.14239729940891266, + "learning_rate": 0.001, + "loss": 2.4444, + "num_input_tokens_seen": 4429534304, + "step": 8450 + }, + { + "epoch": 0.0810904277520064, + "grad_norm": 0.12223052978515625, + "learning_rate": 0.001, + "loss": 2.4318, + "num_input_tokens_seen": 4455747616, + "step": 8500 + }, + { + "epoch": 0.0810904277520064, + "eval_loss": 2.3614137172698975, + "eval_runtime": 80.6431, + "eval_samples_per_second": 62.002, + "eval_steps_per_second": 15.5, + "num_input_tokens_seen": 4455747616, + "step": 8500 + }, + { + "epoch": 0.08156743026819467, + "grad_norm": 0.1250275820493698, + "learning_rate": 0.001, + "loss": 2.4465, + "num_input_tokens_seen": 4481959552, + "step": 8550 + }, + { + "epoch": 0.08204443278438293, + "grad_norm": 0.13238155841827393, + "learning_rate": 0.001, + "loss": 2.4396, + "num_input_tokens_seen": 4508167424, + "step": 8600 + }, + { + "epoch": 0.08252143530057121, + "grad_norm": 0.12801779806613922, + "learning_rate": 0.001, + "loss": 2.444, + "num_input_tokens_seen": 4534381504, + "step": 8650 + }, + { + "epoch": 0.08299843781675949, + "grad_norm": 0.12822921574115753, + "learning_rate": 0.001, + "loss": 2.4388, + "num_input_tokens_seen": 4560591488, + "step": 8700 + }, + { + "epoch": 0.08347544033294775, + "grad_norm": 0.131358340382576, + "learning_rate": 0.001, + "loss": 2.4305, + "num_input_tokens_seen": 4586805888, + "step": 8750 + }, + { + "epoch": 0.08395244284913603, + "grad_norm": 0.12687794864177704, + "learning_rate": 0.001, + "loss": 2.4341, + "num_input_tokens_seen": 4613017088, + "step": 8800 + }, + { + "epoch": 0.0844294453653243, + "grad_norm": 0.12758538126945496, + "learning_rate": 0.001, + "loss": 2.4328, + "num_input_tokens_seen": 4639231200, + "step": 8850 + }, + { + "epoch": 0.08490644788151258, + "grad_norm": 0.13047289848327637, + "learning_rate": 0.001, + "loss": 2.4381, + "num_input_tokens_seen": 4665445600, + "step": 8900 + }, + { + "epoch": 0.08538345039770084, + "grad_norm": 0.12238621711730957, + "learning_rate": 0.001, + "loss": 2.4278, + "num_input_tokens_seen": 4691659872, + "step": 8950 + }, + { + "epoch": 0.08586045291388912, + "grad_norm": 0.1371585875749588, + "learning_rate": 0.001, + "loss": 2.4292, + "num_input_tokens_seen": 4717856864, + "step": 9000 + }, + { + "epoch": 0.08586045291388912, + "eval_loss": 2.3485047817230225, + "eval_runtime": 80.8327, + "eval_samples_per_second": 61.856, + "eval_steps_per_second": 15.464, + "num_input_tokens_seen": 4717856864, + "step": 9000 + }, + { + "epoch": 0.0863374554300774, + "grad_norm": 0.12939831614494324, + "learning_rate": 0.001, + "loss": 2.4345, + "num_input_tokens_seen": 4744065888, + "step": 9050 + }, + { + "epoch": 0.08681445794626566, + "grad_norm": 0.1290908306837082, + "learning_rate": 0.001, + "loss": 2.4216, + "num_input_tokens_seen": 4770277888, + "step": 9100 + }, + { + "epoch": 0.08729146046245394, + "grad_norm": 0.12267202883958817, + "learning_rate": 0.001, + "loss": 2.4195, + "num_input_tokens_seen": 4796489056, + "step": 9150 + }, + { + "epoch": 0.08776846297864221, + "grad_norm": 0.13644106686115265, + "learning_rate": 0.001, + "loss": 2.418, + "num_input_tokens_seen": 4822694912, + "step": 9200 + }, + { + "epoch": 0.08824546549483049, + "grad_norm": 0.12562055885791779, + "learning_rate": 0.001, + "loss": 2.4262, + "num_input_tokens_seen": 4848909312, + "step": 9250 + }, + { + "epoch": 0.08872246801101875, + "grad_norm": 0.12123631685972214, + "learning_rate": 0.001, + "loss": 2.4178, + "num_input_tokens_seen": 4875119552, + "step": 9300 + }, + { + "epoch": 0.08919947052720703, + "grad_norm": 0.12225483357906342, + "learning_rate": 0.001, + "loss": 2.4146, + "num_input_tokens_seen": 4901319648, + "step": 9350 + }, + { + "epoch": 0.08967647304339531, + "grad_norm": 0.1262338012456894, + "learning_rate": 0.001, + "loss": 2.411, + "num_input_tokens_seen": 4927533024, + "step": 9400 + }, + { + "epoch": 0.09015347555958357, + "grad_norm": 0.12114047259092331, + "learning_rate": 0.001, + "loss": 2.4253, + "num_input_tokens_seen": 4953741376, + "step": 9450 + }, + { + "epoch": 0.09063047807577185, + "grad_norm": 0.12057732045650482, + "learning_rate": 0.001, + "loss": 2.4151, + "num_input_tokens_seen": 4979955776, + "step": 9500 + }, + { + "epoch": 0.09063047807577185, + "eval_loss": 2.3371479511260986, + "eval_runtime": 80.7643, + "eval_samples_per_second": 61.909, + "eval_steps_per_second": 15.477, + "num_input_tokens_seen": 4979955776, + "step": 9500 + }, + { + "epoch": 0.09110748059196012, + "grad_norm": 0.13011221587657928, + "learning_rate": 0.001, + "loss": 2.4187, + "num_input_tokens_seen": 5006160832, + "step": 9550 + }, + { + "epoch": 0.0915844831081484, + "grad_norm": 0.133403941988945, + "learning_rate": 0.001, + "loss": 2.414, + "num_input_tokens_seen": 5032374880, + "step": 9600 + }, + { + "epoch": 0.09206148562433666, + "grad_norm": 0.12261918187141418, + "learning_rate": 0.001, + "loss": 2.4012, + "num_input_tokens_seen": 5058581504, + "step": 9650 + }, + { + "epoch": 0.09253848814052494, + "grad_norm": 0.13203178346157074, + "learning_rate": 0.001, + "loss": 2.4058, + "num_input_tokens_seen": 5084791232, + "step": 9700 + }, + { + "epoch": 0.09301549065671322, + "grad_norm": 0.12036694586277008, + "learning_rate": 0.001, + "loss": 2.4079, + "num_input_tokens_seen": 5111005632, + "step": 9750 + }, + { + "epoch": 0.09349249317290148, + "grad_norm": 0.12211828678846359, + "learning_rate": 0.001, + "loss": 2.4118, + "num_input_tokens_seen": 5137213568, + "step": 9800 + }, + { + "epoch": 0.09396949568908976, + "grad_norm": 0.1405865103006363, + "learning_rate": 0.001, + "loss": 2.4128, + "num_input_tokens_seen": 5163427424, + "step": 9850 + }, + { + "epoch": 0.09444649820527803, + "grad_norm": 0.14212754368782043, + "learning_rate": 0.001, + "loss": 2.4162, + "num_input_tokens_seen": 5189637472, + "step": 9900 + }, + { + "epoch": 0.09492350072146631, + "grad_norm": 0.13048619031906128, + "learning_rate": 0.001, + "loss": 2.4152, + "num_input_tokens_seen": 5215848992, + "step": 9950 + }, + { + "epoch": 0.09540050323765457, + "grad_norm": 0.13322441279888153, + "learning_rate": 0.001, + "loss": 2.4056, + "num_input_tokens_seen": 5242058496, + "step": 10000 + }, + { + "epoch": 0.09540050323765457, + "eval_loss": 2.3262879848480225, + "eval_runtime": 81.7469, + "eval_samples_per_second": 61.164, + "eval_steps_per_second": 15.291, + "num_input_tokens_seen": 5242058496, + "step": 10000 + }, + { + "epoch": 0.09587750575384285, + "grad_norm": 0.12825925648212433, + "learning_rate": 0.001, + "loss": 2.4141, + "num_input_tokens_seen": 5268270688, + "step": 10050 + }, + { + "epoch": 0.09635450827003113, + "grad_norm": 0.12106914073228836, + "learning_rate": 0.001, + "loss": 2.3985, + "num_input_tokens_seen": 5294470400, + "step": 10100 + }, + { + "epoch": 0.09683151078621939, + "grad_norm": 0.12551487982273102, + "learning_rate": 0.001, + "loss": 2.4082, + "num_input_tokens_seen": 5320668704, + "step": 10150 + }, + { + "epoch": 0.09730851330240767, + "grad_norm": 0.12404550611972809, + "learning_rate": 0.001, + "loss": 2.411, + "num_input_tokens_seen": 5346877024, + "step": 10200 + }, + { + "epoch": 0.09778551581859594, + "grad_norm": 0.13011808693408966, + "learning_rate": 0.001, + "loss": 2.4036, + "num_input_tokens_seen": 5373088224, + "step": 10250 + }, + { + "epoch": 0.09826251833478422, + "grad_norm": 0.14489437639713287, + "learning_rate": 0.001, + "loss": 2.4013, + "num_input_tokens_seen": 5399292544, + "step": 10300 + }, + { + "epoch": 0.09873952085097248, + "grad_norm": 0.13484328985214233, + "learning_rate": 0.001, + "loss": 2.4028, + "num_input_tokens_seen": 5425504576, + "step": 10350 + }, + { + "epoch": 0.09921652336716076, + "grad_norm": 0.13810865581035614, + "learning_rate": 0.001, + "loss": 2.3825, + "num_input_tokens_seen": 5451713984, + "step": 10400 + }, + { + "epoch": 0.09969352588334904, + "grad_norm": 0.12903955578804016, + "learning_rate": 0.001, + "loss": 2.4072, + "num_input_tokens_seen": 5477927488, + "step": 10450 + }, + { + "epoch": 0.1001705283995373, + "grad_norm": 0.1321643590927124, + "learning_rate": 0.001, + "loss": 2.3912, + "num_input_tokens_seen": 5504131840, + "step": 10500 + }, + { + "epoch": 0.1001705283995373, + "eval_loss": 2.316988945007324, + "eval_runtime": 80.1877, + "eval_samples_per_second": 62.354, + "eval_steps_per_second": 15.588, + "num_input_tokens_seen": 5504131840, + "step": 10500 + }, + { + "epoch": 0.10064753091572558, + "grad_norm": 0.13744521141052246, + "learning_rate": 0.001, + "loss": 2.3908, + "num_input_tokens_seen": 5530337280, + "step": 10550 + }, + { + "epoch": 0.10112453343191385, + "grad_norm": 0.14102710783481598, + "learning_rate": 0.001, + "loss": 2.4009, + "num_input_tokens_seen": 5556550688, + "step": 10600 + }, + { + "epoch": 0.10160153594810213, + "grad_norm": 0.12428227812051773, + "learning_rate": 0.001, + "loss": 2.4, + "num_input_tokens_seen": 5582754848, + "step": 10650 + }, + { + "epoch": 0.1020785384642904, + "grad_norm": 0.12551705539226532, + "learning_rate": 0.001, + "loss": 2.3978, + "num_input_tokens_seen": 5608963360, + "step": 10700 + }, + { + "epoch": 0.10255554098047867, + "grad_norm": 0.12045067548751831, + "learning_rate": 0.001, + "loss": 2.4, + "num_input_tokens_seen": 5635168960, + "step": 10750 + }, + { + "epoch": 0.10303254349666695, + "grad_norm": 0.12914159893989563, + "learning_rate": 0.001, + "loss": 2.4035, + "num_input_tokens_seen": 5661379520, + "step": 10800 + }, + { + "epoch": 0.10350954601285522, + "grad_norm": 0.1325596123933792, + "learning_rate": 0.001, + "loss": 2.3917, + "num_input_tokens_seen": 5687590496, + "step": 10850 + }, + { + "epoch": 0.10398654852904349, + "grad_norm": 0.13543546199798584, + "learning_rate": 0.001, + "loss": 2.3854, + "num_input_tokens_seen": 5713802208, + "step": 10900 + }, + { + "epoch": 0.10446355104523176, + "grad_norm": 0.12515605986118317, + "learning_rate": 0.001, + "loss": 2.4014, + "num_input_tokens_seen": 5740014432, + "step": 10950 + }, + { + "epoch": 0.10494055356142004, + "grad_norm": 0.12793181836605072, + "learning_rate": 0.001, + "loss": 2.3781, + "num_input_tokens_seen": 5766222432, + "step": 11000 + }, + { + "epoch": 0.10494055356142004, + "eval_loss": 2.3075389862060547, + "eval_runtime": 83.0875, + "eval_samples_per_second": 60.178, + "eval_steps_per_second": 15.044, + "num_input_tokens_seen": 5766222432, + "step": 11000 + }, + { + "epoch": 0.1054175560776083, + "grad_norm": 0.13516350090503693, + "learning_rate": 0.001, + "loss": 2.388, + "num_input_tokens_seen": 5792429984, + "step": 11050 + }, + { + "epoch": 0.10589455859379658, + "grad_norm": 0.13579031825065613, + "learning_rate": 0.001, + "loss": 2.393, + "num_input_tokens_seen": 5818639200, + "step": 11100 + }, + { + "epoch": 0.10637156110998486, + "grad_norm": 0.13308782875537872, + "learning_rate": 0.001, + "loss": 2.3812, + "num_input_tokens_seen": 5844851648, + "step": 11150 + }, + { + "epoch": 0.10684856362617313, + "grad_norm": 0.12415602058172226, + "learning_rate": 0.001, + "loss": 2.3932, + "num_input_tokens_seen": 5871049088, + "step": 11200 + }, + { + "epoch": 0.1073255661423614, + "grad_norm": 0.12303244322538376, + "learning_rate": 0.001, + "loss": 2.3807, + "num_input_tokens_seen": 5897261824, + "step": 11250 + }, + { + "epoch": 0.10780256865854967, + "grad_norm": 0.1346784085035324, + "learning_rate": 0.001, + "loss": 2.3938, + "num_input_tokens_seen": 5923473344, + "step": 11300 + }, + { + "epoch": 0.10827957117473795, + "grad_norm": 0.133702352643013, + "learning_rate": 0.001, + "loss": 2.3807, + "num_input_tokens_seen": 5949683104, + "step": 11350 + }, + { + "epoch": 0.10875657369092621, + "grad_norm": 0.14315365254878998, + "learning_rate": 0.001, + "loss": 2.3837, + "num_input_tokens_seen": 5975894688, + "step": 11400 + }, + { + "epoch": 0.10923357620711449, + "grad_norm": 0.13560393452644348, + "learning_rate": 0.001, + "loss": 2.3843, + "num_input_tokens_seen": 6002107712, + "step": 11450 + }, + { + "epoch": 0.10971057872330277, + "grad_norm": 0.13891252875328064, + "learning_rate": 0.001, + "loss": 2.3776, + "num_input_tokens_seen": 6028313408, + "step": 11500 + }, + { + "epoch": 0.10971057872330277, + "eval_loss": 2.297602653503418, + "eval_runtime": 82.6077, + "eval_samples_per_second": 60.527, + "eval_steps_per_second": 15.132, + "num_input_tokens_seen": 6028313408, + "step": 11500 + }, + { + "epoch": 0.11018758123949104, + "grad_norm": 0.13412249088287354, + "learning_rate": 0.001, + "loss": 2.3752, + "num_input_tokens_seen": 6054524992, + "step": 11550 + }, + { + "epoch": 0.11066458375567931, + "grad_norm": 0.12613041698932648, + "learning_rate": 0.001, + "loss": 2.3818, + "num_input_tokens_seen": 6080738688, + "step": 11600 + }, + { + "epoch": 0.11114158627186758, + "grad_norm": 0.1549839973449707, + "learning_rate": 0.001, + "loss": 2.3803, + "num_input_tokens_seen": 6106953088, + "step": 11650 + }, + { + "epoch": 0.11161858878805586, + "grad_norm": 0.12388636916875839, + "learning_rate": 0.001, + "loss": 2.3816, + "num_input_tokens_seen": 6133164992, + "step": 11700 + }, + { + "epoch": 0.11209559130424412, + "grad_norm": 0.13352590799331665, + "learning_rate": 0.001, + "loss": 2.3708, + "num_input_tokens_seen": 6159376640, + "step": 11750 + }, + { + "epoch": 0.1125725938204324, + "grad_norm": 0.12554074823856354, + "learning_rate": 0.001, + "loss": 2.3723, + "num_input_tokens_seen": 6185587392, + "step": 11800 + }, + { + "epoch": 0.11304959633662068, + "grad_norm": 0.12788288295269012, + "learning_rate": 0.001, + "loss": 2.3847, + "num_input_tokens_seen": 6211799456, + "step": 11850 + }, + { + "epoch": 0.11352659885280895, + "grad_norm": 0.1322234570980072, + "learning_rate": 0.001, + "loss": 2.3766, + "num_input_tokens_seen": 6238009952, + "step": 11900 + }, + { + "epoch": 0.11400360136899722, + "grad_norm": 0.13440632820129395, + "learning_rate": 0.001, + "loss": 2.3852, + "num_input_tokens_seen": 6264216672, + "step": 11950 + }, + { + "epoch": 0.1144806038851855, + "grad_norm": 0.12434106320142746, + "learning_rate": 0.001, + "loss": 2.3759, + "num_input_tokens_seen": 6290430912, + "step": 12000 + }, + { + "epoch": 0.1144806038851855, + "eval_loss": 2.290039539337158, + "eval_runtime": 82.0142, + "eval_samples_per_second": 60.965, + "eval_steps_per_second": 15.241, + "num_input_tokens_seen": 6290430912, + "step": 12000 + }, + { + "epoch": 0.11495760640137377, + "grad_norm": 0.132809117436409, + "learning_rate": 0.001, + "loss": 2.3768, + "num_input_tokens_seen": 6316641216, + "step": 12050 + }, + { + "epoch": 0.11543460891756203, + "grad_norm": 0.12777090072631836, + "learning_rate": 0.001, + "loss": 2.3617, + "num_input_tokens_seen": 6342855616, + "step": 12100 + }, + { + "epoch": 0.11591161143375031, + "grad_norm": 0.1328810453414917, + "learning_rate": 0.001, + "loss": 2.3582, + "num_input_tokens_seen": 6369062880, + "step": 12150 + }, + { + "epoch": 0.11638861394993859, + "grad_norm": 0.13146333396434784, + "learning_rate": 0.001, + "loss": 2.3629, + "num_input_tokens_seen": 6395271424, + "step": 12200 + }, + { + "epoch": 0.11686561646612686, + "grad_norm": 0.13155700266361237, + "learning_rate": 0.001, + "loss": 2.3611, + "num_input_tokens_seen": 6421478368, + "step": 12250 + }, + { + "epoch": 0.11734261898231513, + "grad_norm": 0.13666649162769318, + "learning_rate": 0.001, + "loss": 2.3589, + "num_input_tokens_seen": 6447685344, + "step": 12300 + }, + { + "epoch": 0.1178196214985034, + "grad_norm": 0.12632860243320465, + "learning_rate": 0.001, + "loss": 2.3583, + "num_input_tokens_seen": 6473898912, + "step": 12350 + }, + { + "epoch": 0.11829662401469168, + "grad_norm": 0.12418720871210098, + "learning_rate": 0.001, + "loss": 2.362, + "num_input_tokens_seen": 6500113312, + "step": 12400 + }, + { + "epoch": 0.11877362653087994, + "grad_norm": 0.1381850242614746, + "learning_rate": 0.001, + "loss": 2.3601, + "num_input_tokens_seen": 6526318496, + "step": 12450 + }, + { + "epoch": 0.11925062904706822, + "grad_norm": 0.15137051045894623, + "learning_rate": 0.001, + "loss": 2.3501, + "num_input_tokens_seen": 6552526688, + "step": 12500 + }, + { + "epoch": 0.11925062904706822, + "eval_loss": 2.2824325561523438, + "eval_runtime": 82.5334, + "eval_samples_per_second": 60.582, + "eval_steps_per_second": 15.145, + "num_input_tokens_seen": 6552526688, + "step": 12500 + }, + { + "epoch": 0.1197276315632565, + "grad_norm": 0.11741863191127777, + "learning_rate": 0.001, + "loss": 2.3813, + "num_input_tokens_seen": 6578735776, + "step": 12550 + }, + { + "epoch": 0.12020463407944477, + "grad_norm": 0.11584734171628952, + "learning_rate": 0.001, + "loss": 2.3718, + "num_input_tokens_seen": 6604948416, + "step": 12600 + }, + { + "epoch": 0.12068163659563304, + "grad_norm": 0.13832303881645203, + "learning_rate": 0.001, + "loss": 2.3622, + "num_input_tokens_seen": 6631155680, + "step": 12650 + }, + { + "epoch": 0.12115863911182131, + "grad_norm": 0.13220873475074768, + "learning_rate": 0.001, + "loss": 2.3601, + "num_input_tokens_seen": 6657368192, + "step": 12700 + }, + { + "epoch": 0.12163564162800959, + "grad_norm": 0.13639794290065765, + "learning_rate": 0.001, + "loss": 2.3625, + "num_input_tokens_seen": 6683582592, + "step": 12750 + }, + { + "epoch": 0.12211264414419785, + "grad_norm": 0.12675660848617554, + "learning_rate": 0.001, + "loss": 2.361, + "num_input_tokens_seen": 6709791808, + "step": 12800 + }, + { + "epoch": 0.12258964666038613, + "grad_norm": 0.12696968019008636, + "learning_rate": 0.001, + "loss": 2.3654, + "num_input_tokens_seen": 6735995008, + "step": 12850 + }, + { + "epoch": 0.12306664917657441, + "grad_norm": 0.13134369254112244, + "learning_rate": 0.001, + "loss": 2.358, + "num_input_tokens_seen": 6762206080, + "step": 12900 + }, + { + "epoch": 0.12354365169276268, + "grad_norm": 0.1370420753955841, + "learning_rate": 0.001, + "loss": 2.3435, + "num_input_tokens_seen": 6788420480, + "step": 12950 + }, + { + "epoch": 0.12402065420895095, + "grad_norm": 0.13414695858955383, + "learning_rate": 0.001, + "loss": 2.3572, + "num_input_tokens_seen": 6814626336, + "step": 13000 + }, + { + "epoch": 0.12402065420895095, + "eval_loss": 2.275796413421631, + "eval_runtime": 80.1335, + "eval_samples_per_second": 62.396, + "eval_steps_per_second": 15.599, + "num_input_tokens_seen": 6814626336, + "step": 13000 + }, + { + "epoch": 0.12449765672513922, + "grad_norm": 0.1583530604839325, + "learning_rate": 0.001, + "loss": 2.3532, + "num_input_tokens_seen": 6840830240, + "step": 13050 + }, + { + "epoch": 0.1249746592413275, + "grad_norm": 0.13726601004600525, + "learning_rate": 0.001, + "loss": 2.3478, + "num_input_tokens_seen": 6867035264, + "step": 13100 + }, + { + "epoch": 0.12545166175751576, + "grad_norm": 0.13253213465213776, + "learning_rate": 0.001, + "loss": 2.3525, + "num_input_tokens_seen": 6893243904, + "step": 13150 + }, + { + "epoch": 0.12592866427370406, + "grad_norm": 0.14362353086471558, + "learning_rate": 0.001, + "loss": 2.3557, + "num_input_tokens_seen": 6919452384, + "step": 13200 + }, + { + "epoch": 0.12640566678989232, + "grad_norm": 0.13510292768478394, + "learning_rate": 0.001, + "loss": 2.3496, + "num_input_tokens_seen": 6945653600, + "step": 13250 + }, + { + "epoch": 0.12688266930608058, + "grad_norm": 0.14929993450641632, + "learning_rate": 0.001, + "loss": 2.359, + "num_input_tokens_seen": 6971868000, + "step": 13300 + }, + { + "epoch": 0.12735967182226887, + "grad_norm": 0.14635959267616272, + "learning_rate": 0.001, + "loss": 2.3487, + "num_input_tokens_seen": 6998077856, + "step": 13350 + }, + { + "epoch": 0.12783667433845713, + "grad_norm": 0.129233717918396, + "learning_rate": 0.001, + "loss": 2.3566, + "num_input_tokens_seen": 7024292256, + "step": 13400 + }, + { + "epoch": 0.1283136768546454, + "grad_norm": 0.13718649744987488, + "learning_rate": 0.001, + "loss": 2.3528, + "num_input_tokens_seen": 7050505088, + "step": 13450 + }, + { + "epoch": 0.1287906793708337, + "grad_norm": 0.13179470598697662, + "learning_rate": 0.001, + "loss": 2.3451, + "num_input_tokens_seen": 7076718080, + "step": 13500 + }, + { + "epoch": 0.1287906793708337, + "eval_loss": 2.267688274383545, + "eval_runtime": 80.4123, + "eval_samples_per_second": 62.18, + "eval_steps_per_second": 15.545, + "num_input_tokens_seen": 7076718080, + "step": 13500 + }, + { + "epoch": 0.12926768188702195, + "grad_norm": 0.129612535238266, + "learning_rate": 0.001, + "loss": 2.3466, + "num_input_tokens_seen": 7102928416, + "step": 13550 + }, + { + "epoch": 0.12974468440321021, + "grad_norm": 0.14502273499965668, + "learning_rate": 0.001, + "loss": 2.3514, + "num_input_tokens_seen": 7129138080, + "step": 13600 + }, + { + "epoch": 0.1302216869193985, + "grad_norm": 0.12477376312017441, + "learning_rate": 0.001, + "loss": 2.3498, + "num_input_tokens_seen": 7155346432, + "step": 13650 + }, + { + "epoch": 0.13069868943558677, + "grad_norm": 0.12704899907112122, + "learning_rate": 0.001, + "loss": 2.3568, + "num_input_tokens_seen": 7181560832, + "step": 13700 + }, + { + "epoch": 0.13117569195177506, + "grad_norm": 0.127015620470047, + "learning_rate": 0.001, + "loss": 2.344, + "num_input_tokens_seen": 7207773952, + "step": 13750 + }, + { + "epoch": 0.13165269446796332, + "grad_norm": 0.1374967098236084, + "learning_rate": 0.001, + "loss": 2.3446, + "num_input_tokens_seen": 7233985504, + "step": 13800 + }, + { + "epoch": 0.13212969698415158, + "grad_norm": 0.1342546045780182, + "learning_rate": 0.001, + "loss": 2.3429, + "num_input_tokens_seen": 7260196224, + "step": 13850 + }, + { + "epoch": 0.13260669950033988, + "grad_norm": 0.13680048286914825, + "learning_rate": 0.001, + "loss": 2.3499, + "num_input_tokens_seen": 7286404832, + "step": 13900 + }, + { + "epoch": 0.13308370201652814, + "grad_norm": 0.12522684037685394, + "learning_rate": 0.001, + "loss": 2.3507, + "num_input_tokens_seen": 7312617024, + "step": 13950 + }, + { + "epoch": 0.1335607045327164, + "grad_norm": 0.12328428030014038, + "learning_rate": 0.001, + "loss": 2.3437, + "num_input_tokens_seen": 7338830528, + "step": 14000 + }, + { + "epoch": 0.1335607045327164, + "eval_loss": 2.26138973236084, + "eval_runtime": 82.7425, + "eval_samples_per_second": 60.428, + "eval_steps_per_second": 15.107, + "num_input_tokens_seen": 7338830528, + "step": 14000 + }, + { + "epoch": 0.1340377070489047, + "grad_norm": 0.1246449276804924, + "learning_rate": 0.001, + "loss": 2.353, + "num_input_tokens_seen": 7365043520, + "step": 14050 + }, + { + "epoch": 0.13451470956509295, + "grad_norm": 0.1269921213388443, + "learning_rate": 0.001, + "loss": 2.347, + "num_input_tokens_seen": 7391257920, + "step": 14100 + }, + { + "epoch": 0.13499171208128122, + "grad_norm": 0.13668124377727509, + "learning_rate": 0.001, + "loss": 2.3471, + "num_input_tokens_seen": 7417467648, + "step": 14150 + }, + { + "epoch": 0.1354687145974695, + "grad_norm": 0.15413053333759308, + "learning_rate": 0.001, + "loss": 2.3423, + "num_input_tokens_seen": 7443679424, + "step": 14200 + }, + { + "epoch": 0.13594571711365777, + "grad_norm": 0.14467491209506989, + "learning_rate": 0.001, + "loss": 2.3504, + "num_input_tokens_seen": 7469890208, + "step": 14250 + }, + { + "epoch": 0.13642271962984603, + "grad_norm": 0.14191295206546783, + "learning_rate": 0.001, + "loss": 2.3378, + "num_input_tokens_seen": 7496093536, + "step": 14300 + }, + { + "epoch": 0.13689972214603432, + "grad_norm": 0.14077533781528473, + "learning_rate": 0.001, + "loss": 2.3351, + "num_input_tokens_seen": 7522307936, + "step": 14350 + }, + { + "epoch": 0.1373767246622226, + "grad_norm": 0.13784116506576538, + "learning_rate": 0.001, + "loss": 2.3276, + "num_input_tokens_seen": 7548522112, + "step": 14400 + }, + { + "epoch": 0.13785372717841088, + "grad_norm": 0.13621552288532257, + "learning_rate": 0.001, + "loss": 2.3434, + "num_input_tokens_seen": 7574731968, + "step": 14450 + }, + { + "epoch": 0.13833072969459914, + "grad_norm": 0.1428932249546051, + "learning_rate": 0.001, + "loss": 2.328, + "num_input_tokens_seen": 7600938432, + "step": 14500 + }, + { + "epoch": 0.13833072969459914, + "eval_loss": 2.256176710128784, + "eval_runtime": 82.1088, + "eval_samples_per_second": 60.895, + "eval_steps_per_second": 15.224, + "num_input_tokens_seen": 7600938432, + "step": 14500 + }, + { + "epoch": 0.1388077322107874, + "grad_norm": 0.12382518500089645, + "learning_rate": 0.001, + "loss": 2.3401, + "num_input_tokens_seen": 7627147296, + "step": 14550 + }, + { + "epoch": 0.1392847347269757, + "grad_norm": 0.13391022384166718, + "learning_rate": 0.001, + "loss": 2.3305, + "num_input_tokens_seen": 7653361696, + "step": 14600 + }, + { + "epoch": 0.13976173724316396, + "grad_norm": 0.14608611166477203, + "learning_rate": 0.001, + "loss": 2.3344, + "num_input_tokens_seen": 7679565152, + "step": 14650 + }, + { + "epoch": 0.14023873975935222, + "grad_norm": 0.1222352534532547, + "learning_rate": 0.001, + "loss": 2.3235, + "num_input_tokens_seen": 7705765120, + "step": 14700 + }, + { + "epoch": 0.1407157422755405, + "grad_norm": 0.12659655511379242, + "learning_rate": 0.001, + "loss": 2.335, + "num_input_tokens_seen": 7731972128, + "step": 14750 + }, + { + "epoch": 0.14119274479172877, + "grad_norm": 0.15103894472122192, + "learning_rate": 0.001, + "loss": 2.3439, + "num_input_tokens_seen": 7758179904, + "step": 14800 + }, + { + "epoch": 0.14166974730791704, + "grad_norm": 0.12366761267185211, + "learning_rate": 0.001, + "loss": 2.3447, + "num_input_tokens_seen": 7784391104, + "step": 14850 + }, + { + "epoch": 0.14214674982410533, + "grad_norm": 0.12323159724473953, + "learning_rate": 0.001, + "loss": 2.328, + "num_input_tokens_seen": 7810605504, + "step": 14900 + }, + { + "epoch": 0.1426237523402936, + "grad_norm": 0.13751116394996643, + "learning_rate": 0.001, + "loss": 2.3318, + "num_input_tokens_seen": 7836816160, + "step": 14950 + }, + { + "epoch": 0.14310075485648185, + "grad_norm": 0.1390126645565033, + "learning_rate": 0.001, + "loss": 2.3288, + "num_input_tokens_seen": 7863022432, + "step": 15000 + }, + { + "epoch": 0.14310075485648185, + "eval_loss": 2.2487399578094482, + "eval_runtime": 81.911, + "eval_samples_per_second": 61.042, + "eval_steps_per_second": 15.26, + "num_input_tokens_seen": 7863022432, + "step": 15000 + }, + { + "epoch": 0.14357775737267015, + "grad_norm": 0.13024196028709412, + "learning_rate": 0.001, + "loss": 2.337, + "num_input_tokens_seen": 7889232064, + "step": 15050 + }, + { + "epoch": 0.1440547598888584, + "grad_norm": 0.13981671631336212, + "learning_rate": 0.001, + "loss": 2.33, + "num_input_tokens_seen": 7915444032, + "step": 15100 + }, + { + "epoch": 0.1445317624050467, + "grad_norm": 0.12976309657096863, + "learning_rate": 0.001, + "loss": 2.3276, + "num_input_tokens_seen": 7941654272, + "step": 15150 + }, + { + "epoch": 0.14500876492123496, + "grad_norm": 0.14406299591064453, + "learning_rate": 0.001, + "loss": 2.3245, + "num_input_tokens_seen": 7967865216, + "step": 15200 + }, + { + "epoch": 0.14548576743742322, + "grad_norm": 0.13180013000965118, + "learning_rate": 0.001, + "loss": 2.3251, + "num_input_tokens_seen": 7994074080, + "step": 15250 + }, + { + "epoch": 0.14596276995361152, + "grad_norm": 0.14100609719753265, + "learning_rate": 0.001, + "loss": 2.3342, + "num_input_tokens_seen": 8020287168, + "step": 15300 + }, + { + "epoch": 0.14643977246979978, + "grad_norm": 0.14573803544044495, + "learning_rate": 0.001, + "loss": 2.3251, + "num_input_tokens_seen": 8046494176, + "step": 15350 + }, + { + "epoch": 0.14691677498598804, + "grad_norm": 0.14260108768939972, + "learning_rate": 0.001, + "loss": 2.3391, + "num_input_tokens_seen": 8072706720, + "step": 15400 + }, + { + "epoch": 0.14739377750217633, + "grad_norm": 0.12735863029956818, + "learning_rate": 0.001, + "loss": 2.3285, + "num_input_tokens_seen": 8098918432, + "step": 15450 + }, + { + "epoch": 0.1478707800183646, + "grad_norm": 0.13214413821697235, + "learning_rate": 0.001, + "loss": 2.3259, + "num_input_tokens_seen": 8125131456, + "step": 15500 + }, + { + "epoch": 0.1478707800183646, + "eval_loss": 2.2429773807525635, + "eval_runtime": 83.2826, + "eval_samples_per_second": 60.037, + "eval_steps_per_second": 15.009, + "num_input_tokens_seen": 8125131456, + "step": 15500 + }, + { + "epoch": 0.14834778253455286, + "grad_norm": 0.14493685960769653, + "learning_rate": 0.001, + "loss": 2.3191, + "num_input_tokens_seen": 8151344032, + "step": 15550 + }, + { + "epoch": 0.14882478505074115, + "grad_norm": 0.12741337716579437, + "learning_rate": 0.001, + "loss": 2.3341, + "num_input_tokens_seen": 8177556096, + "step": 15600 + }, + { + "epoch": 0.1493017875669294, + "grad_norm": 0.13515712320804596, + "learning_rate": 0.001, + "loss": 2.317, + "num_input_tokens_seen": 8203769152, + "step": 15650 + }, + { + "epoch": 0.1497787900831177, + "grad_norm": 0.1321142017841339, + "learning_rate": 0.001, + "loss": 2.318, + "num_input_tokens_seen": 8229969312, + "step": 15700 + }, + { + "epoch": 0.15025579259930597, + "grad_norm": 0.13010093569755554, + "learning_rate": 0.001, + "loss": 2.3291, + "num_input_tokens_seen": 8256183456, + "step": 15750 + }, + { + "epoch": 0.15073279511549423, + "grad_norm": 0.13135819137096405, + "learning_rate": 0.001, + "loss": 2.3304, + "num_input_tokens_seen": 8282391392, + "step": 15800 + }, + { + "epoch": 0.15120979763168252, + "grad_norm": 0.13832679390907288, + "learning_rate": 0.001, + "loss": 2.3206, + "num_input_tokens_seen": 8308598656, + "step": 15850 + }, + { + "epoch": 0.15168680014787078, + "grad_norm": 0.14133113622665405, + "learning_rate": 0.001, + "loss": 2.3201, + "num_input_tokens_seen": 8334808960, + "step": 15900 + }, + { + "epoch": 0.15216380266405904, + "grad_norm": 0.12465903908014297, + "learning_rate": 0.001, + "loss": 2.3235, + "num_input_tokens_seen": 8361020256, + "step": 15950 + }, + { + "epoch": 0.15264080518024734, + "grad_norm": 0.1318390965461731, + "learning_rate": 0.001, + "loss": 2.3103, + "num_input_tokens_seen": 8387218560, + "step": 16000 + }, + { + "epoch": 0.15264080518024734, + "eval_loss": 2.2365996837615967, + "eval_runtime": 82.6838, + "eval_samples_per_second": 60.471, + "eval_steps_per_second": 15.118, + "num_input_tokens_seen": 8387218560, + "step": 16000 + }, + { + "epoch": 0.1531178076964356, + "grad_norm": 0.1479504108428955, + "learning_rate": 0.001, + "loss": 2.3222, + "num_input_tokens_seen": 8413431680, + "step": 16050 + }, + { + "epoch": 0.15359481021262386, + "grad_norm": 0.12534798681735992, + "learning_rate": 0.001, + "loss": 2.3064, + "num_input_tokens_seen": 8439639584, + "step": 16100 + }, + { + "epoch": 0.15407181272881215, + "grad_norm": 0.13538773357868195, + "learning_rate": 0.001, + "loss": 2.3156, + "num_input_tokens_seen": 8465838816, + "step": 16150 + }, + { + "epoch": 0.15454881524500041, + "grad_norm": 0.132590189576149, + "learning_rate": 0.001, + "loss": 2.3237, + "num_input_tokens_seen": 8492046144, + "step": 16200 + }, + { + "epoch": 0.15502581776118868, + "grad_norm": 0.15315937995910645, + "learning_rate": 0.001, + "loss": 2.3082, + "num_input_tokens_seen": 8518256608, + "step": 16250 + }, + { + "epoch": 0.15550282027737697, + "grad_norm": 0.14311794936656952, + "learning_rate": 0.001, + "loss": 2.3135, + "num_input_tokens_seen": 8544471008, + "step": 16300 + }, + { + "epoch": 0.15597982279356523, + "grad_norm": 0.13563624024391174, + "learning_rate": 0.001, + "loss": 2.319, + "num_input_tokens_seen": 8570685408, + "step": 16350 + }, + { + "epoch": 0.15645682530975352, + "grad_norm": 0.12712624669075012, + "learning_rate": 0.001, + "loss": 2.3216, + "num_input_tokens_seen": 8596898464, + "step": 16400 + }, + { + "epoch": 0.15693382782594179, + "grad_norm": 0.12751208245754242, + "learning_rate": 0.001, + "loss": 2.31, + "num_input_tokens_seen": 8623111776, + "step": 16450 + }, + { + "epoch": 0.15741083034213005, + "grad_norm": 0.1371571272611618, + "learning_rate": 0.001, + "loss": 2.3137, + "num_input_tokens_seen": 8649321536, + "step": 16500 + }, + { + "epoch": 0.15741083034213005, + "eval_loss": 2.232142210006714, + "eval_runtime": 82.2631, + "eval_samples_per_second": 60.781, + "eval_steps_per_second": 15.195, + "num_input_tokens_seen": 8649321536, + "step": 16500 + }, + { + "epoch": 0.15788783285831834, + "grad_norm": 0.1343661993741989, + "learning_rate": 0.001, + "loss": 2.313, + "num_input_tokens_seen": 8675529792, + "step": 16550 + }, + { + "epoch": 0.1583648353745066, + "grad_norm": 0.14035946130752563, + "learning_rate": 0.001, + "loss": 2.3097, + "num_input_tokens_seen": 8701739712, + "step": 16600 + }, + { + "epoch": 0.15884183789069486, + "grad_norm": 0.12256618589162827, + "learning_rate": 0.001, + "loss": 2.3102, + "num_input_tokens_seen": 8727951616, + "step": 16650 + }, + { + "epoch": 0.15931884040688316, + "grad_norm": 0.1355251669883728, + "learning_rate": 0.001, + "loss": 2.3099, + "num_input_tokens_seen": 8754160960, + "step": 16700 + }, + { + "epoch": 0.15979584292307142, + "grad_norm": 0.13105979561805725, + "learning_rate": 0.001, + "loss": 2.3075, + "num_input_tokens_seen": 8780369344, + "step": 16750 + }, + { + "epoch": 0.16027284543925968, + "grad_norm": 0.13410349190235138, + "learning_rate": 0.001, + "loss": 2.3134, + "num_input_tokens_seen": 8806583648, + "step": 16800 + }, + { + "epoch": 0.16074984795544797, + "grad_norm": 0.13738510012626648, + "learning_rate": 0.001, + "loss": 2.3051, + "num_input_tokens_seen": 8832796864, + "step": 16850 + }, + { + "epoch": 0.16122685047163623, + "grad_norm": 0.13892224431037903, + "learning_rate": 0.001, + "loss": 2.3243, + "num_input_tokens_seen": 8859005632, + "step": 16900 + }, + { + "epoch": 0.1617038529878245, + "grad_norm": 0.12879416346549988, + "learning_rate": 0.001, + "loss": 2.3123, + "num_input_tokens_seen": 8885216960, + "step": 16950 + }, + { + "epoch": 0.1621808555040128, + "grad_norm": 0.1300731897354126, + "learning_rate": 0.001, + "loss": 2.3148, + "num_input_tokens_seen": 8911431360, + "step": 17000 + }, + { + "epoch": 0.1621808555040128, + "eval_loss": 2.2285797595977783, + "eval_runtime": 82.7553, + "eval_samples_per_second": 60.419, + "eval_steps_per_second": 15.105, + "num_input_tokens_seen": 8911431360, + "step": 17000 + }, + { + "epoch": 0.16265785802020105, + "grad_norm": 0.13246452808380127, + "learning_rate": 0.001, + "loss": 2.3083, + "num_input_tokens_seen": 8937641184, + "step": 17050 + }, + { + "epoch": 0.16313486053638934, + "grad_norm": 0.1408887803554535, + "learning_rate": 0.001, + "loss": 2.311, + "num_input_tokens_seen": 8963855584, + "step": 17100 + }, + { + "epoch": 0.1636118630525776, + "grad_norm": 0.13497628271579742, + "learning_rate": 0.001, + "loss": 2.3075, + "num_input_tokens_seen": 8990067520, + "step": 17150 + }, + { + "epoch": 0.16408886556876587, + "grad_norm": 0.13361407816410065, + "learning_rate": 0.001, + "loss": 2.3048, + "num_input_tokens_seen": 9016266240, + "step": 17200 + }, + { + "epoch": 0.16456586808495416, + "grad_norm": 0.145442932844162, + "learning_rate": 0.001, + "loss": 2.3086, + "num_input_tokens_seen": 9042480000, + "step": 17250 + }, + { + "epoch": 0.16504287060114242, + "grad_norm": 0.12842726707458496, + "learning_rate": 0.001, + "loss": 2.3029, + "num_input_tokens_seen": 9068694400, + "step": 17300 + }, + { + "epoch": 0.16551987311733068, + "grad_norm": 0.14847566187381744, + "learning_rate": 0.001, + "loss": 2.3188, + "num_input_tokens_seen": 9094902272, + "step": 17350 + }, + { + "epoch": 0.16599687563351898, + "grad_norm": 0.13063114881515503, + "learning_rate": 0.001, + "loss": 2.297, + "num_input_tokens_seen": 9121110464, + "step": 17400 + }, + { + "epoch": 0.16647387814970724, + "grad_norm": 0.16154611110687256, + "learning_rate": 0.001, + "loss": 2.3122, + "num_input_tokens_seen": 9147321632, + "step": 17450 + }, + { + "epoch": 0.1669508806658955, + "grad_norm": 0.12539538741111755, + "learning_rate": 0.001, + "loss": 2.3076, + "num_input_tokens_seen": 9173533056, + "step": 17500 + }, + { + "epoch": 0.1669508806658955, + "eval_loss": 2.2225582599639893, + "eval_runtime": 82.4465, + "eval_samples_per_second": 60.645, + "eval_steps_per_second": 15.161, + "num_input_tokens_seen": 9173533056, + "step": 17500 + }, + { + "epoch": 0.1674278831820838, + "grad_norm": 0.1455305516719818, + "learning_rate": 0.001, + "loss": 2.2964, + "num_input_tokens_seen": 9199741376, + "step": 17550 + }, + { + "epoch": 0.16790488569827206, + "grad_norm": 0.1348162442445755, + "learning_rate": 0.001, + "loss": 2.3055, + "num_input_tokens_seen": 9225953984, + "step": 17600 + }, + { + "epoch": 0.16838188821446032, + "grad_norm": 0.1430789977312088, + "learning_rate": 0.001, + "loss": 2.309, + "num_input_tokens_seen": 9252159616, + "step": 17650 + }, + { + "epoch": 0.1688588907306486, + "grad_norm": 0.14652392268180847, + "learning_rate": 0.001, + "loss": 2.3052, + "num_input_tokens_seen": 9278371392, + "step": 17700 + }, + { + "epoch": 0.16933589324683687, + "grad_norm": 0.13538667559623718, + "learning_rate": 0.001, + "loss": 2.3147, + "num_input_tokens_seen": 9304572736, + "step": 17750 + }, + { + "epoch": 0.16981289576302516, + "grad_norm": 0.13386596739292145, + "learning_rate": 0.001, + "loss": 2.3044, + "num_input_tokens_seen": 9330787136, + "step": 17800 + }, + { + "epoch": 0.17028989827921343, + "grad_norm": 0.1391988843679428, + "learning_rate": 0.001, + "loss": 2.2956, + "num_input_tokens_seen": 9357001536, + "step": 17850 + }, + { + "epoch": 0.1707669007954017, + "grad_norm": 0.13184039294719696, + "learning_rate": 0.001, + "loss": 2.2965, + "num_input_tokens_seen": 9383215936, + "step": 17900 + }, + { + "epoch": 0.17124390331158998, + "grad_norm": 0.14412756264209747, + "learning_rate": 0.001, + "loss": 2.292, + "num_input_tokens_seen": 9409427392, + "step": 17950 + }, + { + "epoch": 0.17172090582777824, + "grad_norm": 0.12889249622821808, + "learning_rate": 0.001, + "loss": 2.2963, + "num_input_tokens_seen": 9435637536, + "step": 18000 + }, + { + "epoch": 0.17172090582777824, + "eval_loss": 2.216590166091919, + "eval_runtime": 82.2621, + "eval_samples_per_second": 60.781, + "eval_steps_per_second": 15.195, + "num_input_tokens_seen": 9435637536, + "step": 18000 + }, + { + "epoch": 0.1721979083439665, + "grad_norm": 0.1292746514081955, + "learning_rate": 0.001, + "loss": 2.3103, + "num_input_tokens_seen": 9461834176, + "step": 18050 + }, + { + "epoch": 0.1726749108601548, + "grad_norm": 0.13079434633255005, + "learning_rate": 0.001, + "loss": 2.3045, + "num_input_tokens_seen": 9488041024, + "step": 18100 + }, + { + "epoch": 0.17315191337634306, + "grad_norm": 0.1451425701379776, + "learning_rate": 0.001, + "loss": 2.3127, + "num_input_tokens_seen": 9514248512, + "step": 18150 + }, + { + "epoch": 0.17362891589253132, + "grad_norm": 0.14286376535892487, + "learning_rate": 0.001, + "loss": 2.296, + "num_input_tokens_seen": 9540460992, + "step": 18200 + }, + { + "epoch": 0.1741059184087196, + "grad_norm": 0.14793863892555237, + "learning_rate": 0.001, + "loss": 2.295, + "num_input_tokens_seen": 9566675392, + "step": 18250 + }, + { + "epoch": 0.17458292092490788, + "grad_norm": 0.13479390740394592, + "learning_rate": 0.001, + "loss": 2.2925, + "num_input_tokens_seen": 9592885152, + "step": 18300 + }, + { + "epoch": 0.17505992344109614, + "grad_norm": 0.14160257577896118, + "learning_rate": 0.001, + "loss": 2.2984, + "num_input_tokens_seen": 9619098336, + "step": 18350 + }, + { + "epoch": 0.17553692595728443, + "grad_norm": 0.1370360404253006, + "learning_rate": 0.001, + "loss": 2.283, + "num_input_tokens_seen": 9645312736, + "step": 18400 + }, + { + "epoch": 0.1760139284734727, + "grad_norm": 0.13573038578033447, + "learning_rate": 0.001, + "loss": 2.2902, + "num_input_tokens_seen": 9671524352, + "step": 18450 + }, + { + "epoch": 0.17649093098966098, + "grad_norm": 0.14134661853313446, + "learning_rate": 0.001, + "loss": 2.3052, + "num_input_tokens_seen": 9697738752, + "step": 18500 + }, + { + "epoch": 0.17649093098966098, + "eval_loss": 2.2122554779052734, + "eval_runtime": 83.2223, + "eval_samples_per_second": 60.08, + "eval_steps_per_second": 15.02, + "num_input_tokens_seen": 9697738752, + "step": 18500 + }, + { + "epoch": 0.17696793350584925, + "grad_norm": 0.13307662308216095, + "learning_rate": 0.001, + "loss": 2.2964, + "num_input_tokens_seen": 9723948800, + "step": 18550 + }, + { + "epoch": 0.1774449360220375, + "grad_norm": 0.14741794764995575, + "learning_rate": 0.001, + "loss": 2.2947, + "num_input_tokens_seen": 9750163200, + "step": 18600 + }, + { + "epoch": 0.1779219385382258, + "grad_norm": 0.1431114673614502, + "learning_rate": 0.001, + "loss": 2.299, + "num_input_tokens_seen": 9776369696, + "step": 18650 + }, + { + "epoch": 0.17839894105441406, + "grad_norm": 0.1539929211139679, + "learning_rate": 0.001, + "loss": 2.2949, + "num_input_tokens_seen": 9802580192, + "step": 18700 + }, + { + "epoch": 0.17887594357060232, + "grad_norm": 0.13433188199996948, + "learning_rate": 0.001, + "loss": 2.2964, + "num_input_tokens_seen": 9828792608, + "step": 18750 + }, + { + "epoch": 0.17935294608679062, + "grad_norm": 0.12964121997356415, + "learning_rate": 0.001, + "loss": 2.2981, + "num_input_tokens_seen": 9855007008, + "step": 18800 + }, + { + "epoch": 0.17982994860297888, + "grad_norm": 0.1349261850118637, + "learning_rate": 0.001, + "loss": 2.2875, + "num_input_tokens_seen": 9881218528, + "step": 18850 + }, + { + "epoch": 0.18030695111916714, + "grad_norm": 0.12905199825763702, + "learning_rate": 0.001, + "loss": 2.2973, + "num_input_tokens_seen": 9907428192, + "step": 18900 + }, + { + "epoch": 0.18078395363535543, + "grad_norm": 0.13705725967884064, + "learning_rate": 0.001, + "loss": 2.2936, + "num_input_tokens_seen": 9933638112, + "step": 18950 + }, + { + "epoch": 0.1812609561515437, + "grad_norm": 0.13736732304096222, + "learning_rate": 0.001, + "loss": 2.2941, + "num_input_tokens_seen": 9959851776, + "step": 19000 + }, + { + "epoch": 0.1812609561515437, + "eval_loss": 2.2090442180633545, + "eval_runtime": 82.3122, + "eval_samples_per_second": 60.744, + "eval_steps_per_second": 15.186, + "num_input_tokens_seen": 9959851776, + "step": 19000 + }, + { + "epoch": 0.18173795866773199, + "grad_norm": 0.14507094025611877, + "learning_rate": 0.001, + "loss": 2.2873, + "num_input_tokens_seen": 9986063136, + "step": 19050 + }, + { + "epoch": 0.18221496118392025, + "grad_norm": 0.14904463291168213, + "learning_rate": 0.001, + "loss": 2.2857, + "num_input_tokens_seen": 10012277344, + "step": 19100 + }, + { + "epoch": 0.1826919637001085, + "grad_norm": 0.1437740921974182, + "learning_rate": 0.001, + "loss": 2.293, + "num_input_tokens_seen": 10038487648, + "step": 19150 + }, + { + "epoch": 0.1831689662162968, + "grad_norm": 0.13508464395999908, + "learning_rate": 0.001, + "loss": 2.2888, + "num_input_tokens_seen": 10064701312, + "step": 19200 + }, + { + "epoch": 0.18364596873248507, + "grad_norm": 0.1317240297794342, + "learning_rate": 0.001, + "loss": 2.2916, + "num_input_tokens_seen": 10090910496, + "step": 19250 + }, + { + "epoch": 0.18412297124867333, + "grad_norm": 0.13427771627902985, + "learning_rate": 0.001, + "loss": 2.2861, + "num_input_tokens_seen": 10117124896, + "step": 19300 + }, + { + "epoch": 0.18459997376486162, + "grad_norm": 0.17408016324043274, + "learning_rate": 0.001, + "loss": 2.2826, + "num_input_tokens_seen": 10143339296, + "step": 19350 + }, + { + "epoch": 0.18507697628104988, + "grad_norm": 0.15968067944049835, + "learning_rate": 0.001, + "loss": 2.3486, + "num_input_tokens_seen": 10169540896, + "step": 19400 + }, + { + "epoch": 0.18555397879723814, + "grad_norm": 0.12174613028764725, + "learning_rate": 0.001, + "loss": 2.2963, + "num_input_tokens_seen": 10195751040, + "step": 19450 + }, + { + "epoch": 0.18603098131342644, + "grad_norm": 0.1349005550146103, + "learning_rate": 0.001, + "loss": 2.2888, + "num_input_tokens_seen": 10221963136, + "step": 19500 + }, + { + "epoch": 0.18603098131342644, + "eval_loss": 2.2117698192596436, + "eval_runtime": 81.7726, + "eval_samples_per_second": 61.145, + "eval_steps_per_second": 15.286, + "num_input_tokens_seen": 10221963136, + "step": 19500 + }, + { + "epoch": 0.1865079838296147, + "grad_norm": 0.12541209161281586, + "learning_rate": 0.001, + "loss": 2.2966, + "num_input_tokens_seen": 10248173024, + "step": 19550 + }, + { + "epoch": 0.18698498634580296, + "grad_norm": 0.9949402213096619, + "learning_rate": 0.001, + "loss": 2.2825, + "num_input_tokens_seen": 10274379520, + "step": 19600 + }, + { + "epoch": 0.18746198886199125, + "grad_norm": 0.13587036728858948, + "learning_rate": 0.001, + "loss": 2.2963, + "num_input_tokens_seen": 10300591040, + "step": 19650 + }, + { + "epoch": 0.18793899137817952, + "grad_norm": 0.14047515392303467, + "learning_rate": 0.001, + "loss": 2.2871, + "num_input_tokens_seen": 10326800928, + "step": 19700 + }, + { + "epoch": 0.1884159938943678, + "grad_norm": 0.13005691766738892, + "learning_rate": 0.001, + "loss": 2.3058, + "num_input_tokens_seen": 10353009568, + "step": 19750 + }, + { + "epoch": 0.18889299641055607, + "grad_norm": 0.13120286166667938, + "learning_rate": 0.001, + "loss": 2.2927, + "num_input_tokens_seen": 10379210048, + "step": 19800 + }, + { + "epoch": 0.18936999892674433, + "grad_norm": 0.14059720933437347, + "learning_rate": 0.001, + "loss": 2.2887, + "num_input_tokens_seen": 10405422080, + "step": 19850 + }, + { + "epoch": 0.18984700144293262, + "grad_norm": 0.13072331249713898, + "learning_rate": 0.001, + "loss": 2.2928, + "num_input_tokens_seen": 10431635744, + "step": 19900 + }, + { + "epoch": 0.19032400395912089, + "grad_norm": 0.14114826917648315, + "learning_rate": 0.001, + "loss": 2.284, + "num_input_tokens_seen": 10457844768, + "step": 19950 + }, + { + "epoch": 0.19080100647530915, + "grad_norm": 0.13289280235767365, + "learning_rate": 0.001, + "loss": 2.2894, + "num_input_tokens_seen": 10484059168, + "step": 20000 + }, + { + "epoch": 0.19080100647530915, + "eval_loss": 2.20172381401062, + "eval_runtime": 82.0311, + "eval_samples_per_second": 60.952, + "eval_steps_per_second": 15.238, + "num_input_tokens_seen": 10484059168, + "step": 20000 + }, + { + "epoch": 0.19127800899149744, + "grad_norm": 0.14763779938220978, + "learning_rate": 0.001, + "loss": 2.2901, + "num_input_tokens_seen": 10510273568, + "step": 20050 + }, + { + "epoch": 0.1917550115076857, + "grad_norm": 0.13675181567668915, + "learning_rate": 0.001, + "loss": 2.2809, + "num_input_tokens_seen": 10536486432, + "step": 20100 + }, + { + "epoch": 0.19223201402387396, + "grad_norm": 0.13765814900398254, + "learning_rate": 0.001, + "loss": 2.289, + "num_input_tokens_seen": 10562700832, + "step": 20150 + }, + { + "epoch": 0.19270901654006226, + "grad_norm": 0.1395033895969391, + "learning_rate": 0.001, + "loss": 2.286, + "num_input_tokens_seen": 10588891776, + "step": 20200 + }, + { + "epoch": 0.19318601905625052, + "grad_norm": 0.14209134876728058, + "learning_rate": 0.001, + "loss": 2.2805, + "num_input_tokens_seen": 10615106176, + "step": 20250 + }, + { + "epoch": 0.19366302157243878, + "grad_norm": 0.1354246586561203, + "learning_rate": 0.001, + "loss": 2.2817, + "num_input_tokens_seen": 10641312192, + "step": 20300 + }, + { + "epoch": 0.19414002408862707, + "grad_norm": 0.1305360049009323, + "learning_rate": 0.001, + "loss": 2.2871, + "num_input_tokens_seen": 10667526080, + "step": 20350 + }, + { + "epoch": 0.19461702660481534, + "grad_norm": 0.13948604464530945, + "learning_rate": 0.001, + "loss": 2.2841, + "num_input_tokens_seen": 10693737664, + "step": 20400 + }, + { + "epoch": 0.19509402912100363, + "grad_norm": 0.12424025684595108, + "learning_rate": 0.001, + "loss": 2.2838, + "num_input_tokens_seen": 10719951328, + "step": 20450 + }, + { + "epoch": 0.1955710316371919, + "grad_norm": 0.14923156797885895, + "learning_rate": 0.001, + "loss": 2.2882, + "num_input_tokens_seen": 10746164768, + "step": 20500 + }, + { + "epoch": 0.1955710316371919, + "eval_loss": 2.1973979473114014, + "eval_runtime": 82.5785, + "eval_samples_per_second": 60.548, + "eval_steps_per_second": 15.137, + "num_input_tokens_seen": 10746164768, + "step": 20500 + }, + { + "epoch": 0.19604803415338015, + "grad_norm": 0.14104098081588745, + "learning_rate": 0.001, + "loss": 2.2842, + "num_input_tokens_seen": 10772366272, + "step": 20550 + }, + { + "epoch": 0.19652503666956844, + "grad_norm": 0.1297464370727539, + "learning_rate": 0.001, + "loss": 2.2835, + "num_input_tokens_seen": 10798576992, + "step": 20600 + }, + { + "epoch": 0.1970020391857567, + "grad_norm": 0.1436595320701599, + "learning_rate": 0.001, + "loss": 2.2744, + "num_input_tokens_seen": 10824786016, + "step": 20650 + }, + { + "epoch": 0.19747904170194497, + "grad_norm": 0.14249320328235626, + "learning_rate": 0.001, + "loss": 2.2823, + "num_input_tokens_seen": 10850990816, + "step": 20700 + }, + { + "epoch": 0.19795604421813326, + "grad_norm": 0.14356642961502075, + "learning_rate": 0.001, + "loss": 2.2891, + "num_input_tokens_seen": 10877198080, + "step": 20750 + }, + { + "epoch": 0.19843304673432152, + "grad_norm": 0.13429990410804749, + "learning_rate": 0.001, + "loss": 2.2786, + "num_input_tokens_seen": 10903412480, + "step": 20800 + }, + { + "epoch": 0.19891004925050979, + "grad_norm": 0.1445857435464859, + "learning_rate": 0.001, + "loss": 2.2761, + "num_input_tokens_seen": 10929623488, + "step": 20850 + }, + { + "epoch": 0.19938705176669808, + "grad_norm": 0.13351799547672272, + "learning_rate": 0.001, + "loss": 2.2801, + "num_input_tokens_seen": 10955835264, + "step": 20900 + }, + { + "epoch": 0.19986405428288634, + "grad_norm": 0.13249842822551727, + "learning_rate": 0.001, + "loss": 2.2807, + "num_input_tokens_seen": 10982046176, + "step": 20950 + }, + { + "epoch": 0.2003410567990746, + "grad_norm": 0.12836948037147522, + "learning_rate": 0.001, + "loss": 2.2677, + "num_input_tokens_seen": 11008255872, + "step": 21000 + }, + { + "epoch": 0.2003410567990746, + "eval_loss": 2.1926751136779785, + "eval_runtime": 82.0402, + "eval_samples_per_second": 60.946, + "eval_steps_per_second": 15.236, + "num_input_tokens_seen": 11008255872, + "step": 21000 + }, + { + "epoch": 0.2008180593152629, + "grad_norm": 0.1373811513185501, + "learning_rate": 0.001, + "loss": 2.2797, + "num_input_tokens_seen": 11034461376, + "step": 21050 + }, + { + "epoch": 0.20129506183145116, + "grad_norm": 0.130074143409729, + "learning_rate": 0.001, + "loss": 2.2607, + "num_input_tokens_seen": 11060670400, + "step": 21100 + }, + { + "epoch": 0.20177206434763945, + "grad_norm": 0.13792483508586884, + "learning_rate": 0.001, + "loss": 2.2676, + "num_input_tokens_seen": 11086880960, + "step": 21150 + }, + { + "epoch": 0.2022490668638277, + "grad_norm": 0.1272813379764557, + "learning_rate": 0.001, + "loss": 2.2728, + "num_input_tokens_seen": 11113093024, + "step": 21200 + }, + { + "epoch": 0.20272606938001597, + "grad_norm": 0.1411881297826767, + "learning_rate": 0.001, + "loss": 2.2725, + "num_input_tokens_seen": 11139305248, + "step": 21250 + }, + { + "epoch": 0.20320307189620426, + "grad_norm": 0.15611988306045532, + "learning_rate": 0.001, + "loss": 2.2689, + "num_input_tokens_seen": 11165511296, + "step": 21300 + }, + { + "epoch": 0.20368007441239253, + "grad_norm": 0.13627928495407104, + "learning_rate": 0.001, + "loss": 2.2721, + "num_input_tokens_seen": 11191723008, + "step": 21350 + }, + { + "epoch": 0.2041570769285808, + "grad_norm": 0.14451804757118225, + "learning_rate": 0.001, + "loss": 2.2693, + "num_input_tokens_seen": 11217937408, + "step": 21400 + }, + { + "epoch": 0.20463407944476908, + "grad_norm": 0.1419762820005417, + "learning_rate": 0.001, + "loss": 2.2654, + "num_input_tokens_seen": 11244151808, + "step": 21450 + }, + { + "epoch": 0.20511108196095734, + "grad_norm": 0.139862060546875, + "learning_rate": 0.001, + "loss": 2.2577, + "num_input_tokens_seen": 11270362240, + "step": 21500 + }, + { + "epoch": 0.20511108196095734, + "eval_loss": 2.190119981765747, + "eval_runtime": 82.6744, + "eval_samples_per_second": 60.478, + "eval_steps_per_second": 15.12, + "num_input_tokens_seen": 11270362240, + "step": 21500 + }, + { + "epoch": 0.2055880844771456, + "grad_norm": 0.13659726083278656, + "learning_rate": 0.001, + "loss": 2.273, + "num_input_tokens_seen": 11296576640, + "step": 21550 + }, + { + "epoch": 0.2060650869933339, + "grad_norm": 0.12730096280574799, + "learning_rate": 0.001, + "loss": 2.2651, + "num_input_tokens_seen": 11322782720, + "step": 21600 + }, + { + "epoch": 0.20654208950952216, + "grad_norm": 0.1489386260509491, + "learning_rate": 0.001, + "loss": 2.279, + "num_input_tokens_seen": 11348986624, + "step": 21650 + }, + { + "epoch": 0.20701909202571045, + "grad_norm": 0.13576173782348633, + "learning_rate": 0.001, + "loss": 2.2604, + "num_input_tokens_seen": 11375197504, + "step": 21700 + }, + { + "epoch": 0.2074960945418987, + "grad_norm": 0.15627992153167725, + "learning_rate": 0.001, + "loss": 2.2675, + "num_input_tokens_seen": 11401405824, + "step": 21750 + }, + { + "epoch": 0.20797309705808698, + "grad_norm": 0.14521074295043945, + "learning_rate": 0.001, + "loss": 2.2696, + "num_input_tokens_seen": 11427616352, + "step": 21800 + }, + { + "epoch": 0.20845009957427527, + "grad_norm": 0.15713635087013245, + "learning_rate": 0.001, + "loss": 2.2763, + "num_input_tokens_seen": 11453820544, + "step": 21850 + }, + { + "epoch": 0.20892710209046353, + "grad_norm": 0.15573829412460327, + "learning_rate": 0.001, + "loss": 2.2675, + "num_input_tokens_seen": 11480031968, + "step": 21900 + }, + { + "epoch": 0.2094041046066518, + "grad_norm": 0.1381770819425583, + "learning_rate": 0.001, + "loss": 2.2698, + "num_input_tokens_seen": 11506246368, + "step": 21950 + }, + { + "epoch": 0.20988110712284008, + "grad_norm": 0.17163416743278503, + "learning_rate": 0.001, + "loss": 2.2725, + "num_input_tokens_seen": 11532457408, + "step": 22000 + }, + { + "epoch": 0.20988110712284008, + "eval_loss": 2.1856114864349365, + "eval_runtime": 82.4539, + "eval_samples_per_second": 60.64, + "eval_steps_per_second": 15.16, + "num_input_tokens_seen": 11532457408, + "step": 22000 + }, + { + "epoch": 0.21035810963902835, + "grad_norm": 0.13742762804031372, + "learning_rate": 0.001, + "loss": 2.2819, + "num_input_tokens_seen": 11558665024, + "step": 22050 + }, + { + "epoch": 0.2108351121552166, + "grad_norm": 0.1606198400259018, + "learning_rate": 0.001, + "loss": 2.2809, + "num_input_tokens_seen": 11584879424, + "step": 22100 + }, + { + "epoch": 0.2113121146714049, + "grad_norm": 0.1447242647409439, + "learning_rate": 0.001, + "loss": 2.272, + "num_input_tokens_seen": 11611093824, + "step": 22150 + }, + { + "epoch": 0.21178911718759316, + "grad_norm": 0.14127366244792938, + "learning_rate": 0.001, + "loss": 2.2605, + "num_input_tokens_seen": 11637306304, + "step": 22200 + }, + { + "epoch": 0.21226611970378143, + "grad_norm": 0.13236087560653687, + "learning_rate": 0.001, + "loss": 2.2676, + "num_input_tokens_seen": 11663514912, + "step": 22250 + }, + { + "epoch": 0.21274312221996972, + "grad_norm": 0.131170392036438, + "learning_rate": 0.001, + "loss": 2.2573, + "num_input_tokens_seen": 11689716960, + "step": 22300 + }, + { + "epoch": 0.21322012473615798, + "grad_norm": 0.16254200041294098, + "learning_rate": 0.001, + "loss": 2.252, + "num_input_tokens_seen": 11715924704, + "step": 22350 + }, + { + "epoch": 0.21369712725234627, + "grad_norm": 0.14250585436820984, + "learning_rate": 0.001, + "loss": 2.2739, + "num_input_tokens_seen": 11742135840, + "step": 22400 + }, + { + "epoch": 0.21417412976853453, + "grad_norm": 0.131143257021904, + "learning_rate": 0.001, + "loss": 2.2661, + "num_input_tokens_seen": 11768347232, + "step": 22450 + }, + { + "epoch": 0.2146511322847228, + "grad_norm": 0.13916635513305664, + "learning_rate": 0.001, + "loss": 2.2519, + "num_input_tokens_seen": 11794558656, + "step": 22500 + }, + { + "epoch": 0.2146511322847228, + "eval_loss": 2.183870315551758, + "eval_runtime": 82.7383, + "eval_samples_per_second": 60.431, + "eval_steps_per_second": 15.108, + "num_input_tokens_seen": 11794558656, + "step": 22500 + }, + { + "epoch": 0.21512813480091109, + "grad_norm": 0.14609429240226746, + "learning_rate": 0.001, + "loss": 2.2588, + "num_input_tokens_seen": 11820771552, + "step": 22550 + }, + { + "epoch": 0.21560513731709935, + "grad_norm": 0.140402689576149, + "learning_rate": 0.001, + "loss": 2.2826, + "num_input_tokens_seen": 11846982720, + "step": 22600 + }, + { + "epoch": 0.2160821398332876, + "grad_norm": 0.14499905705451965, + "learning_rate": 0.001, + "loss": 2.2704, + "num_input_tokens_seen": 11873196512, + "step": 22650 + }, + { + "epoch": 0.2165591423494759, + "grad_norm": 0.14119970798492432, + "learning_rate": 0.001, + "loss": 2.2564, + "num_input_tokens_seen": 11899404224, + "step": 22700 + }, + { + "epoch": 0.21703614486566417, + "grad_norm": 0.13618482649326324, + "learning_rate": 0.001, + "loss": 2.2616, + "num_input_tokens_seen": 11925615904, + "step": 22750 + }, + { + "epoch": 0.21751314738185243, + "grad_norm": 0.15894031524658203, + "learning_rate": 0.001, + "loss": 2.2826, + "num_input_tokens_seen": 11951821216, + "step": 22800 + }, + { + "epoch": 0.21799014989804072, + "grad_norm": 0.13335183262825012, + "learning_rate": 0.001, + "loss": 2.2615, + "num_input_tokens_seen": 11978025056, + "step": 22850 + }, + { + "epoch": 0.21846715241422898, + "grad_norm": 0.1391170769929886, + "learning_rate": 0.001, + "loss": 2.2677, + "num_input_tokens_seen": 12004238368, + "step": 22900 + }, + { + "epoch": 0.21894415493041725, + "grad_norm": 0.14966392517089844, + "learning_rate": 0.001, + "loss": 2.2742, + "num_input_tokens_seen": 12030450848, + "step": 22950 + }, + { + "epoch": 0.21942115744660554, + "grad_norm": 0.15453237295150757, + "learning_rate": 0.001, + "loss": 2.266, + "num_input_tokens_seen": 12056655104, + "step": 23000 + }, + { + "epoch": 0.21942115744660554, + "eval_loss": 2.17928409576416, + "eval_runtime": 82.7688, + "eval_samples_per_second": 60.409, + "eval_steps_per_second": 15.102, + "num_input_tokens_seen": 12056655104, + "step": 23000 + }, + { + "epoch": 0.2198981599627938, + "grad_norm": 0.1397433876991272, + "learning_rate": 0.001, + "loss": 2.2649, + "num_input_tokens_seen": 12082858944, + "step": 23050 + }, + { + "epoch": 0.2203751624789821, + "grad_norm": 0.13647589087486267, + "learning_rate": 0.001, + "loss": 2.2639, + "num_input_tokens_seen": 12109067872, + "step": 23100 + }, + { + "epoch": 0.22085216499517035, + "grad_norm": 0.1422584354877472, + "learning_rate": 0.001, + "loss": 2.2641, + "num_input_tokens_seen": 12135282272, + "step": 23150 + }, + { + "epoch": 0.22132916751135862, + "grad_norm": 0.14315859973430634, + "learning_rate": 0.001, + "loss": 2.2587, + "num_input_tokens_seen": 12161491840, + "step": 23200 + }, + { + "epoch": 0.2218061700275469, + "grad_norm": 0.14624252915382385, + "learning_rate": 0.001, + "loss": 2.2658, + "num_input_tokens_seen": 12187700480, + "step": 23250 + }, + { + "epoch": 0.22228317254373517, + "grad_norm": 0.14765731990337372, + "learning_rate": 0.001, + "loss": 2.263, + "num_input_tokens_seen": 12213907680, + "step": 23300 + }, + { + "epoch": 0.22276017505992343, + "grad_norm": 0.15279778838157654, + "learning_rate": 0.001, + "loss": 2.2529, + "num_input_tokens_seen": 12240118848, + "step": 23350 + }, + { + "epoch": 0.22323717757611172, + "grad_norm": 0.1480414867401123, + "learning_rate": 0.001, + "loss": 2.2545, + "num_input_tokens_seen": 12266329376, + "step": 23400 + }, + { + "epoch": 0.22371418009229999, + "grad_norm": 0.1284361481666565, + "learning_rate": 0.001, + "loss": 2.2659, + "num_input_tokens_seen": 12292540960, + "step": 23450 + }, + { + "epoch": 0.22419118260848825, + "grad_norm": 0.138748899102211, + "learning_rate": 0.001, + "loss": 2.2531, + "num_input_tokens_seen": 12318747360, + "step": 23500 + }, + { + "epoch": 0.22419118260848825, + "eval_loss": 2.17672061920166, + "eval_runtime": 82.9579, + "eval_samples_per_second": 60.272, + "eval_steps_per_second": 15.068, + "num_input_tokens_seen": 12318747360, + "step": 23500 + }, + { + "epoch": 0.22466818512467654, + "grad_norm": 0.13704177737236023, + "learning_rate": 0.001, + "loss": 2.2553, + "num_input_tokens_seen": 12344950880, + "step": 23550 + }, + { + "epoch": 0.2251451876408648, + "grad_norm": 0.1447945088148117, + "learning_rate": 0.001, + "loss": 2.2516, + "num_input_tokens_seen": 12371157184, + "step": 23600 + }, + { + "epoch": 0.22562219015705307, + "grad_norm": 0.13667277991771698, + "learning_rate": 0.001, + "loss": 2.2556, + "num_input_tokens_seen": 12397370368, + "step": 23650 + }, + { + "epoch": 0.22609919267324136, + "grad_norm": 0.13712671399116516, + "learning_rate": 0.001, + "loss": 2.2511, + "num_input_tokens_seen": 12423583616, + "step": 23700 + }, + { + "epoch": 0.22657619518942962, + "grad_norm": 0.15262199938297272, + "learning_rate": 0.001, + "loss": 2.2545, + "num_input_tokens_seen": 12449797024, + "step": 23750 + }, + { + "epoch": 0.2270531977056179, + "grad_norm": 0.1370035856962204, + "learning_rate": 0.001, + "loss": 2.2558, + "num_input_tokens_seen": 12476011424, + "step": 23800 + }, + { + "epoch": 0.22753020022180617, + "grad_norm": 0.13982941210269928, + "learning_rate": 0.001, + "loss": 2.2334, + "num_input_tokens_seen": 12502223744, + "step": 23850 + }, + { + "epoch": 0.22800720273799444, + "grad_norm": 0.14523112773895264, + "learning_rate": 0.001, + "loss": 2.2536, + "num_input_tokens_seen": 12528433728, + "step": 23900 + }, + { + "epoch": 0.22848420525418273, + "grad_norm": 0.1419558823108673, + "learning_rate": 0.001, + "loss": 2.2568, + "num_input_tokens_seen": 12554642496, + "step": 23950 + }, + { + "epoch": 0.228961207770371, + "grad_norm": 0.1442372351884842, + "learning_rate": 0.001, + "loss": 2.2522, + "num_input_tokens_seen": 12580853504, + "step": 24000 + }, + { + "epoch": 0.228961207770371, + "eval_loss": 2.1732497215270996, + "eval_runtime": 82.253, + "eval_samples_per_second": 60.788, + "eval_steps_per_second": 15.197, + "num_input_tokens_seen": 12580853504, + "step": 24000 + }, + { + "epoch": 0.22943821028655925, + "grad_norm": 0.13844448328018188, + "learning_rate": 0.001, + "loss": 2.2602, + "num_input_tokens_seen": 12607066272, + "step": 24050 + }, + { + "epoch": 0.22991521280274754, + "grad_norm": 0.14124740660190582, + "learning_rate": 0.001, + "loss": 2.2533, + "num_input_tokens_seen": 12633279840, + "step": 24100 + }, + { + "epoch": 0.2303922153189358, + "grad_norm": 0.136307492852211, + "learning_rate": 0.001, + "loss": 2.2483, + "num_input_tokens_seen": 12659487104, + "step": 24150 + }, + { + "epoch": 0.23086921783512407, + "grad_norm": 0.13790194690227509, + "learning_rate": 0.001, + "loss": 2.2511, + "num_input_tokens_seen": 12685699648, + "step": 24200 + }, + { + "epoch": 0.23134622035131236, + "grad_norm": 0.13985110819339752, + "learning_rate": 0.001, + "loss": 2.2637, + "num_input_tokens_seen": 12711914048, + "step": 24250 + }, + { + "epoch": 0.23182322286750062, + "grad_norm": 0.14229442179203033, + "learning_rate": 0.001, + "loss": 2.2615, + "num_input_tokens_seen": 12738126208, + "step": 24300 + }, + { + "epoch": 0.23230022538368889, + "grad_norm": 0.13444297015666962, + "learning_rate": 0.001, + "loss": 2.2568, + "num_input_tokens_seen": 12764340608, + "step": 24350 + }, + { + "epoch": 0.23277722789987718, + "grad_norm": 0.14222408831119537, + "learning_rate": 0.001, + "loss": 2.2579, + "num_input_tokens_seen": 12790554080, + "step": 24400 + }, + { + "epoch": 0.23325423041606544, + "grad_norm": 0.14746561646461487, + "learning_rate": 0.001, + "loss": 2.2524, + "num_input_tokens_seen": 12816760928, + "step": 24450 + }, + { + "epoch": 0.23373123293225373, + "grad_norm": 0.14593298733234406, + "learning_rate": 0.001, + "loss": 2.2604, + "num_input_tokens_seen": 12842964128, + "step": 24500 + }, + { + "epoch": 0.23373123293225373, + "eval_loss": 2.171039342880249, + "eval_runtime": 82.0422, + "eval_samples_per_second": 60.944, + "eval_steps_per_second": 15.236, + "num_input_tokens_seen": 12842964128, + "step": 24500 + }, + { + "epoch": 0.234208235448442, + "grad_norm": 0.13651101291179657, + "learning_rate": 0.001, + "loss": 2.264, + "num_input_tokens_seen": 12869178432, + "step": 24550 + }, + { + "epoch": 0.23468523796463026, + "grad_norm": 0.15846236050128937, + "learning_rate": 0.001, + "loss": 2.2408, + "num_input_tokens_seen": 12895391296, + "step": 24600 + }, + { + "epoch": 0.23516224048081855, + "grad_norm": 0.16644498705863953, + "learning_rate": 0.001, + "loss": 2.2455, + "num_input_tokens_seen": 12921601472, + "step": 24650 + }, + { + "epoch": 0.2356392429970068, + "grad_norm": 0.14885085821151733, + "learning_rate": 0.001, + "loss": 2.261, + "num_input_tokens_seen": 12947811040, + "step": 24700 + }, + { + "epoch": 0.23611624551319507, + "grad_norm": 0.12761050462722778, + "learning_rate": 0.001, + "loss": 2.2534, + "num_input_tokens_seen": 12974014752, + "step": 24750 + }, + { + "epoch": 0.23659324802938336, + "grad_norm": 0.13764004409313202, + "learning_rate": 0.001, + "loss": 2.2592, + "num_input_tokens_seen": 13000226080, + "step": 24800 + }, + { + "epoch": 0.23707025054557163, + "grad_norm": 0.14264918863773346, + "learning_rate": 0.001, + "loss": 2.2482, + "num_input_tokens_seen": 13026440480, + "step": 24850 + }, + { + "epoch": 0.2375472530617599, + "grad_norm": 0.143757626414299, + "learning_rate": 0.001, + "loss": 2.2512, + "num_input_tokens_seen": 13052650624, + "step": 24900 + }, + { + "epoch": 0.23802425557794818, + "grad_norm": 0.157669335603714, + "learning_rate": 0.001, + "loss": 2.2597, + "num_input_tokens_seen": 13078857312, + "step": 24950 + }, + { + "epoch": 0.23850125809413644, + "grad_norm": 0.13242298364639282, + "learning_rate": 0.001, + "loss": 2.253, + "num_input_tokens_seen": 13105069824, + "step": 25000 + }, + { + "epoch": 0.23850125809413644, + "eval_loss": 2.1673171520233154, + "eval_runtime": 82.9202, + "eval_samples_per_second": 60.299, + "eval_steps_per_second": 15.075, + "num_input_tokens_seen": 13105069824, + "step": 25000 + }, + { + "epoch": 0.23897826061032473, + "grad_norm": 0.13656990230083466, + "learning_rate": 0.001, + "loss": 2.2661, + "num_input_tokens_seen": 13131284224, + "step": 25050 + }, + { + "epoch": 0.239455263126513, + "grad_norm": 0.13822485506534576, + "learning_rate": 0.001, + "loss": 2.251, + "num_input_tokens_seen": 13157497664, + "step": 25100 + }, + { + "epoch": 0.23993226564270126, + "grad_norm": 0.13563229143619537, + "learning_rate": 0.001, + "loss": 2.2495, + "num_input_tokens_seen": 13183708032, + "step": 25150 + }, + { + "epoch": 0.24040926815888955, + "grad_norm": 0.1263655722141266, + "learning_rate": 0.001, + "loss": 2.2478, + "num_input_tokens_seen": 13209920512, + "step": 25200 + }, + { + "epoch": 0.2408862706750778, + "grad_norm": 0.14311367273330688, + "learning_rate": 0.001, + "loss": 2.2444, + "num_input_tokens_seen": 13236127552, + "step": 25250 + }, + { + "epoch": 0.24136327319126608, + "grad_norm": 0.14571504294872284, + "learning_rate": 0.001, + "loss": 2.2587, + "num_input_tokens_seen": 13262329824, + "step": 25300 + }, + { + "epoch": 0.24184027570745437, + "grad_norm": 0.16660790145397186, + "learning_rate": 0.001, + "loss": 2.2554, + "num_input_tokens_seen": 13288536736, + "step": 25350 + }, + { + "epoch": 0.24231727822364263, + "grad_norm": 0.14656688272953033, + "learning_rate": 0.001, + "loss": 2.2505, + "num_input_tokens_seen": 13314751136, + "step": 25400 + }, + { + "epoch": 0.2427942807398309, + "grad_norm": 0.14772988855838776, + "learning_rate": 0.001, + "loss": 2.2451, + "num_input_tokens_seen": 13340963264, + "step": 25450 + }, + { + "epoch": 0.24327128325601918, + "grad_norm": 0.15227681398391724, + "learning_rate": 0.001, + "loss": 2.2388, + "num_input_tokens_seen": 13367175456, + "step": 25500 + }, + { + "epoch": 0.24327128325601918, + "eval_loss": 2.165367364883423, + "eval_runtime": 83.0673, + "eval_samples_per_second": 60.192, + "eval_steps_per_second": 15.048, + "num_input_tokens_seen": 13367175456, + "step": 25500 + }, + { + "epoch": 0.24374828577220745, + "grad_norm": 0.14786367118358612, + "learning_rate": 0.001, + "loss": 2.2416, + "num_input_tokens_seen": 13393384960, + "step": 25550 + }, + { + "epoch": 0.2442252882883957, + "grad_norm": 0.1325492560863495, + "learning_rate": 0.001, + "loss": 2.2425, + "num_input_tokens_seen": 13419599360, + "step": 25600 + }, + { + "epoch": 0.244702290804584, + "grad_norm": 0.14455124735832214, + "learning_rate": 0.001, + "loss": 2.235, + "num_input_tokens_seen": 13445800960, + "step": 25650 + }, + { + "epoch": 0.24517929332077226, + "grad_norm": 0.14452672004699707, + "learning_rate": 0.001, + "loss": 2.2436, + "num_input_tokens_seen": 13472012768, + "step": 25700 + }, + { + "epoch": 0.24565629583696055, + "grad_norm": 0.13266603648662567, + "learning_rate": 0.001, + "loss": 2.2406, + "num_input_tokens_seen": 13498221376, + "step": 25750 + }, + { + "epoch": 0.24613329835314882, + "grad_norm": 0.14916899800300598, + "learning_rate": 0.001, + "loss": 2.2432, + "num_input_tokens_seen": 13524435776, + "step": 25800 + }, + { + "epoch": 0.24661030086933708, + "grad_norm": 0.12612730264663696, + "learning_rate": 0.001, + "loss": 2.2415, + "num_input_tokens_seen": 13550647872, + "step": 25850 + }, + { + "epoch": 0.24708730338552537, + "grad_norm": 0.1731935441493988, + "learning_rate": 0.001, + "loss": 2.2427, + "num_input_tokens_seen": 13576851328, + "step": 25900 + }, + { + "epoch": 0.24756430590171363, + "grad_norm": 0.14770366251468658, + "learning_rate": 0.001, + "loss": 2.2543, + "num_input_tokens_seen": 13603058656, + "step": 25950 + }, + { + "epoch": 0.2480413084179019, + "grad_norm": 0.141856387257576, + "learning_rate": 0.001, + "loss": 2.2511, + "num_input_tokens_seen": 13629260960, + "step": 26000 + }, + { + "epoch": 0.2480413084179019, + "eval_loss": 2.1629014015197754, + "eval_runtime": 82.2057, + "eval_samples_per_second": 60.823, + "eval_steps_per_second": 15.206, + "num_input_tokens_seen": 13629260960, + "step": 26000 + }, + { + "epoch": 0.2485183109340902, + "grad_norm": 0.15201644599437714, + "learning_rate": 0.001, + "loss": 2.2435, + "num_input_tokens_seen": 13655473920, + "step": 26050 + }, + { + "epoch": 0.24899531345027845, + "grad_norm": 0.13137054443359375, + "learning_rate": 0.001, + "loss": 2.2433, + "num_input_tokens_seen": 13681688320, + "step": 26100 + }, + { + "epoch": 0.2494723159664667, + "grad_norm": 0.12851624190807343, + "learning_rate": 0.001, + "loss": 2.248, + "num_input_tokens_seen": 13707897792, + "step": 26150 + }, + { + "epoch": 0.249949318482655, + "grad_norm": 0.14121095836162567, + "learning_rate": 0.001, + "loss": 2.2385, + "num_input_tokens_seen": 13734107712, + "step": 26200 + }, + { + "epoch": 0.25042632099884327, + "grad_norm": 0.14826469123363495, + "learning_rate": 0.001, + "loss": 2.242, + "num_input_tokens_seen": 13760320736, + "step": 26250 + }, + { + "epoch": 0.25090332351503153, + "grad_norm": 0.1528318077325821, + "learning_rate": 0.001, + "loss": 2.2419, + "num_input_tokens_seen": 13786534208, + "step": 26300 + }, + { + "epoch": 0.2513803260312198, + "grad_norm": 0.15646971762180328, + "learning_rate": 0.001, + "loss": 2.2407, + "num_input_tokens_seen": 13812743072, + "step": 26350 + }, + { + "epoch": 0.2518573285474081, + "grad_norm": 0.13336172699928284, + "learning_rate": 0.001, + "loss": 2.2483, + "num_input_tokens_seen": 13838957088, + "step": 26400 + }, + { + "epoch": 0.2523343310635964, + "grad_norm": 0.13618668913841248, + "learning_rate": 0.001, + "loss": 2.2347, + "num_input_tokens_seen": 13865167008, + "step": 26450 + }, + { + "epoch": 0.25281133357978464, + "grad_norm": 0.14507335424423218, + "learning_rate": 0.001, + "loss": 2.2453, + "num_input_tokens_seen": 13891381408, + "step": 26500 + }, + { + "epoch": 0.25281133357978464, + "eval_loss": 2.159205675125122, + "eval_runtime": 82.4369, + "eval_samples_per_second": 60.652, + "eval_steps_per_second": 15.163, + "num_input_tokens_seen": 13891381408, + "step": 26500 + }, + { + "epoch": 0.2532883360959729, + "grad_norm": 0.15305021405220032, + "learning_rate": 0.001, + "loss": 2.2366, + "num_input_tokens_seen": 13917593408, + "step": 26550 + }, + { + "epoch": 0.25376533861216116, + "grad_norm": 0.1411104053258896, + "learning_rate": 0.001, + "loss": 2.237, + "num_input_tokens_seen": 13943805696, + "step": 26600 + }, + { + "epoch": 0.2542423411283494, + "grad_norm": 0.13980819284915924, + "learning_rate": 0.001, + "loss": 2.2511, + "num_input_tokens_seen": 13970020096, + "step": 26650 + }, + { + "epoch": 0.25471934364453774, + "grad_norm": 0.14256146550178528, + "learning_rate": 0.001, + "loss": 2.233, + "num_input_tokens_seen": 13996232832, + "step": 26700 + }, + { + "epoch": 0.255196346160726, + "grad_norm": 0.1522730439901352, + "learning_rate": 0.001, + "loss": 2.2433, + "num_input_tokens_seen": 14022447232, + "step": 26750 + }, + { + "epoch": 0.25567334867691427, + "grad_norm": 0.14082644879817963, + "learning_rate": 0.001, + "loss": 2.2481, + "num_input_tokens_seen": 14048655424, + "step": 26800 + }, + { + "epoch": 0.25615035119310253, + "grad_norm": 0.14439330995082855, + "learning_rate": 0.001, + "loss": 2.2418, + "num_input_tokens_seen": 14074868960, + "step": 26850 + }, + { + "epoch": 0.2566273537092908, + "grad_norm": 0.15122254192829132, + "learning_rate": 0.001, + "loss": 2.2237, + "num_input_tokens_seen": 14101083360, + "step": 26900 + }, + { + "epoch": 0.2571043562254791, + "grad_norm": 0.14002810418605804, + "learning_rate": 0.001, + "loss": 2.2446, + "num_input_tokens_seen": 14127297760, + "step": 26950 + }, + { + "epoch": 0.2575813587416674, + "grad_norm": 0.135335773229599, + "learning_rate": 0.001, + "loss": 2.2302, + "num_input_tokens_seen": 14153506688, + "step": 27000 + }, + { + "epoch": 0.2575813587416674, + "eval_loss": 2.1568057537078857, + "eval_runtime": 82.6565, + "eval_samples_per_second": 60.491, + "eval_steps_per_second": 15.123, + "num_input_tokens_seen": 14153506688, + "step": 27000 + }, + { + "epoch": 0.25805836125785564, + "grad_norm": 0.1621844619512558, + "learning_rate": 0.001, + "loss": 2.2378, + "num_input_tokens_seen": 14179709120, + "step": 27050 + }, + { + "epoch": 0.2585353637740439, + "grad_norm": 0.1400565207004547, + "learning_rate": 0.001, + "loss": 2.2416, + "num_input_tokens_seen": 14205917376, + "step": 27100 + }, + { + "epoch": 0.25901236629023217, + "grad_norm": 0.1439099758863449, + "learning_rate": 0.001, + "loss": 2.2443, + "num_input_tokens_seen": 14232130048, + "step": 27150 + }, + { + "epoch": 0.25948936880642043, + "grad_norm": 0.1417345255613327, + "learning_rate": 0.001, + "loss": 2.2416, + "num_input_tokens_seen": 14258342176, + "step": 27200 + }, + { + "epoch": 0.25996637132260875, + "grad_norm": 0.14503255486488342, + "learning_rate": 0.001, + "loss": 2.2363, + "num_input_tokens_seen": 14284552192, + "step": 27250 + }, + { + "epoch": 0.260443373838797, + "grad_norm": 0.14273668825626373, + "learning_rate": 0.001, + "loss": 2.2321, + "num_input_tokens_seen": 14310756928, + "step": 27300 + }, + { + "epoch": 0.2609203763549853, + "grad_norm": 0.14368127286434174, + "learning_rate": 0.001, + "loss": 2.2442, + "num_input_tokens_seen": 14336966336, + "step": 27350 + }, + { + "epoch": 0.26139737887117354, + "grad_norm": 0.13848727941513062, + "learning_rate": 0.001, + "loss": 2.2256, + "num_input_tokens_seen": 14363168832, + "step": 27400 + }, + { + "epoch": 0.2618743813873618, + "grad_norm": 0.13322456181049347, + "learning_rate": 0.001, + "loss": 2.2422, + "num_input_tokens_seen": 14389376864, + "step": 27450 + }, + { + "epoch": 0.2623513839035501, + "grad_norm": 0.14387381076812744, + "learning_rate": 0.001, + "loss": 2.2305, + "num_input_tokens_seen": 14415584288, + "step": 27500 + }, + { + "epoch": 0.2623513839035501, + "eval_loss": 2.1539554595947266, + "eval_runtime": 82.2364, + "eval_samples_per_second": 60.8, + "eval_steps_per_second": 15.2, + "num_input_tokens_seen": 14415584288, + "step": 27500 + }, + { + "epoch": 0.2628283864197384, + "grad_norm": 0.1330934762954712, + "learning_rate": 0.001, + "loss": 2.2403, + "num_input_tokens_seen": 14441793888, + "step": 27550 + }, + { + "epoch": 0.26330538893592664, + "grad_norm": 0.1436392068862915, + "learning_rate": 0.001, + "loss": 2.2378, + "num_input_tokens_seen": 14468003360, + "step": 27600 + }, + { + "epoch": 0.2637823914521149, + "grad_norm": 0.14959146082401276, + "learning_rate": 0.001, + "loss": 2.2322, + "num_input_tokens_seen": 14494214688, + "step": 27650 + }, + { + "epoch": 0.26425939396830317, + "grad_norm": 0.13732261955738068, + "learning_rate": 0.001, + "loss": 2.2409, + "num_input_tokens_seen": 14520428096, + "step": 27700 + }, + { + "epoch": 0.26473639648449143, + "grad_norm": 0.14365635812282562, + "learning_rate": 0.001, + "loss": 2.2309, + "num_input_tokens_seen": 14546632320, + "step": 27750 + }, + { + "epoch": 0.26521339900067975, + "grad_norm": 0.14219102263450623, + "learning_rate": 0.001, + "loss": 2.2301, + "num_input_tokens_seen": 14572845824, + "step": 27800 + }, + { + "epoch": 0.265690401516868, + "grad_norm": 0.12978766858577728, + "learning_rate": 0.001, + "loss": 2.2338, + "num_input_tokens_seen": 14599059104, + "step": 27850 + }, + { + "epoch": 0.2661674040330563, + "grad_norm": 0.14842823147773743, + "learning_rate": 0.001, + "loss": 2.2341, + "num_input_tokens_seen": 14625272352, + "step": 27900 + }, + { + "epoch": 0.26664440654924454, + "grad_norm": 0.15755997598171234, + "learning_rate": 0.001, + "loss": 2.2346, + "num_input_tokens_seen": 14651486752, + "step": 27950 + }, + { + "epoch": 0.2671214090654328, + "grad_norm": 0.15303048491477966, + "learning_rate": 0.001, + "loss": 2.2285, + "num_input_tokens_seen": 14677696896, + "step": 28000 + }, + { + "epoch": 0.2671214090654328, + "eval_loss": 2.1511332988739014, + "eval_runtime": 82.4707, + "eval_samples_per_second": 60.628, + "eval_steps_per_second": 15.157, + "num_input_tokens_seen": 14677696896, + "step": 28000 + }, + { + "epoch": 0.2675984115816211, + "grad_norm": 0.14884154498577118, + "learning_rate": 0.001, + "loss": 2.2273, + "num_input_tokens_seen": 14703910880, + "step": 28050 + }, + { + "epoch": 0.2680754140978094, + "grad_norm": 0.15423212945461273, + "learning_rate": 0.001, + "loss": 2.2328, + "num_input_tokens_seen": 14730125280, + "step": 28100 + }, + { + "epoch": 0.26855241661399765, + "grad_norm": 0.14974601566791534, + "learning_rate": 0.001, + "loss": 2.2282, + "num_input_tokens_seen": 14756332672, + "step": 28150 + }, + { + "epoch": 0.2690294191301859, + "grad_norm": 0.13820908963680267, + "learning_rate": 0.001, + "loss": 2.2281, + "num_input_tokens_seen": 14782539808, + "step": 28200 + }, + { + "epoch": 0.2695064216463742, + "grad_norm": 0.142080619931221, + "learning_rate": 0.001, + "loss": 2.2375, + "num_input_tokens_seen": 14808750880, + "step": 28250 + }, + { + "epoch": 0.26998342416256244, + "grad_norm": 0.12440051883459091, + "learning_rate": 0.001, + "loss": 2.2495, + "num_input_tokens_seen": 14834952704, + "step": 28300 + }, + { + "epoch": 0.27046042667875075, + "grad_norm": 0.1336458921432495, + "learning_rate": 0.001, + "loss": 2.2379, + "num_input_tokens_seen": 14861154080, + "step": 28350 + }, + { + "epoch": 0.270937429194939, + "grad_norm": 0.13392235338687897, + "learning_rate": 0.001, + "loss": 2.2309, + "num_input_tokens_seen": 14887355328, + "step": 28400 + }, + { + "epoch": 0.2714144317111273, + "grad_norm": 0.14661704003810883, + "learning_rate": 0.001, + "loss": 2.2339, + "num_input_tokens_seen": 14913569312, + "step": 28450 + }, + { + "epoch": 0.27189143422731554, + "grad_norm": 0.14747242629528046, + "learning_rate": 0.001, + "loss": 2.23, + "num_input_tokens_seen": 14939780320, + "step": 28500 + }, + { + "epoch": 0.27189143422731554, + "eval_loss": 2.1497859954833984, + "eval_runtime": 82.2811, + "eval_samples_per_second": 60.767, + "eval_steps_per_second": 15.192, + "num_input_tokens_seen": 14939780320, + "step": 28500 + }, + { + "epoch": 0.2723684367435038, + "grad_norm": 0.14154289662837982, + "learning_rate": 0.001, + "loss": 2.2335, + "num_input_tokens_seen": 14965993216, + "step": 28550 + }, + { + "epoch": 0.27284543925969207, + "grad_norm": 0.13375169038772583, + "learning_rate": 0.001, + "loss": 2.2336, + "num_input_tokens_seen": 14992207616, + "step": 28600 + }, + { + "epoch": 0.2733224417758804, + "grad_norm": 0.13693657517433167, + "learning_rate": 0.001, + "loss": 2.2406, + "num_input_tokens_seen": 15018421472, + "step": 28650 + }, + { + "epoch": 0.27379944429206865, + "grad_norm": 0.14101336896419525, + "learning_rate": 0.001, + "loss": 2.226, + "num_input_tokens_seen": 15044631584, + "step": 28700 + }, + { + "epoch": 0.2742764468082569, + "grad_norm": 0.1408209353685379, + "learning_rate": 0.001, + "loss": 2.2241, + "num_input_tokens_seen": 15070842272, + "step": 28750 + }, + { + "epoch": 0.2747534493244452, + "grad_norm": 0.13644535839557648, + "learning_rate": 0.001, + "loss": 2.234, + "num_input_tokens_seen": 15097056032, + "step": 28800 + }, + { + "epoch": 0.27523045184063344, + "grad_norm": 0.15270450711250305, + "learning_rate": 0.001, + "loss": 2.2408, + "num_input_tokens_seen": 15123268096, + "step": 28850 + }, + { + "epoch": 0.27570745435682176, + "grad_norm": 0.1514887809753418, + "learning_rate": 0.001, + "loss": 2.2306, + "num_input_tokens_seen": 15149474304, + "step": 28900 + }, + { + "epoch": 0.27618445687301, + "grad_norm": 0.13505037128925323, + "learning_rate": 0.001, + "loss": 2.2321, + "num_input_tokens_seen": 15175685536, + "step": 28950 + }, + { + "epoch": 0.2766614593891983, + "grad_norm": 0.1479782909154892, + "learning_rate": 0.001, + "loss": 2.2136, + "num_input_tokens_seen": 15201894176, + "step": 29000 + }, + { + "epoch": 0.2766614593891983, + "eval_loss": 2.147918701171875, + "eval_runtime": 82.3338, + "eval_samples_per_second": 60.728, + "eval_steps_per_second": 15.182, + "num_input_tokens_seen": 15201894176, + "step": 29000 + }, + { + "epoch": 0.27713846190538655, + "grad_norm": 0.15296803414821625, + "learning_rate": 0.001, + "loss": 2.2364, + "num_input_tokens_seen": 15228104928, + "step": 29050 + }, + { + "epoch": 0.2776154644215748, + "grad_norm": 0.13571250438690186, + "learning_rate": 0.001, + "loss": 2.2435, + "num_input_tokens_seen": 15254317056, + "step": 29100 + }, + { + "epoch": 0.27809246693776307, + "grad_norm": 0.13723242282867432, + "learning_rate": 0.001, + "loss": 2.229, + "num_input_tokens_seen": 15280525888, + "step": 29150 + }, + { + "epoch": 0.2785694694539514, + "grad_norm": 0.14391474425792694, + "learning_rate": 0.001, + "loss": 2.2304, + "num_input_tokens_seen": 15306733696, + "step": 29200 + }, + { + "epoch": 0.27904647197013965, + "grad_norm": 0.14517651498317719, + "learning_rate": 0.001, + "loss": 2.2281, + "num_input_tokens_seen": 15332940096, + "step": 29250 + }, + { + "epoch": 0.2795234744863279, + "grad_norm": 0.15248551964759827, + "learning_rate": 0.001, + "loss": 2.2329, + "num_input_tokens_seen": 15359147872, + "step": 29300 + }, + { + "epoch": 0.2800004770025162, + "grad_norm": 0.13358819484710693, + "learning_rate": 0.001, + "loss": 2.2321, + "num_input_tokens_seen": 15385357856, + "step": 29350 + }, + { + "epoch": 0.28047747951870444, + "grad_norm": 0.13603374361991882, + "learning_rate": 0.001, + "loss": 2.2251, + "num_input_tokens_seen": 15411568224, + "step": 29400 + }, + { + "epoch": 0.28095448203489276, + "grad_norm": 0.13729465007781982, + "learning_rate": 0.001, + "loss": 2.2238, + "num_input_tokens_seen": 15437777056, + "step": 29450 + }, + { + "epoch": 0.281431484551081, + "grad_norm": 0.14745451509952545, + "learning_rate": 0.001, + "loss": 2.2333, + "num_input_tokens_seen": 15463988928, + "step": 29500 + }, + { + "epoch": 0.281431484551081, + "eval_loss": 2.1445696353912354, + "eval_runtime": 82.7914, + "eval_samples_per_second": 60.393, + "eval_steps_per_second": 15.098, + "num_input_tokens_seen": 15463988928, + "step": 29500 + }, + { + "epoch": 0.2819084870672693, + "grad_norm": 0.1480429619550705, + "learning_rate": 0.001, + "loss": 2.2247, + "num_input_tokens_seen": 15490198688, + "step": 29550 + }, + { + "epoch": 0.28238548958345755, + "grad_norm": 0.1438407003879547, + "learning_rate": 0.001, + "loss": 2.2394, + "num_input_tokens_seen": 15516405824, + "step": 29600 + }, + { + "epoch": 0.2828624920996458, + "grad_norm": 0.14721985161304474, + "learning_rate": 0.001, + "loss": 2.2281, + "num_input_tokens_seen": 15542610624, + "step": 29650 + }, + { + "epoch": 0.2833394946158341, + "grad_norm": 0.17293605208396912, + "learning_rate": 0.001, + "loss": 2.2306, + "num_input_tokens_seen": 15568821408, + "step": 29700 + }, + { + "epoch": 0.2838164971320224, + "grad_norm": 0.14340583980083466, + "learning_rate": 0.001, + "loss": 2.2249, + "num_input_tokens_seen": 15595031840, + "step": 29750 + }, + { + "epoch": 0.28429349964821066, + "grad_norm": 0.14480094611644745, + "learning_rate": 0.001, + "loss": 2.2238, + "num_input_tokens_seen": 15621240512, + "step": 29800 + }, + { + "epoch": 0.2847705021643989, + "grad_norm": 0.13383765518665314, + "learning_rate": 0.001, + "loss": 2.2129, + "num_input_tokens_seen": 15647444192, + "step": 29850 + }, + { + "epoch": 0.2852475046805872, + "grad_norm": 0.1253250688314438, + "learning_rate": 0.001, + "loss": 2.2237, + "num_input_tokens_seen": 15673657408, + "step": 29900 + }, + { + "epoch": 0.28572450719677545, + "grad_norm": 0.14244495332241058, + "learning_rate": 0.001, + "loss": 2.2189, + "num_input_tokens_seen": 15699867008, + "step": 29950 + }, + { + "epoch": 0.2862015097129637, + "grad_norm": 0.14013972878456116, + "learning_rate": 0.001, + "loss": 2.2241, + "num_input_tokens_seen": 15726072896, + "step": 30000 + }, + { + "epoch": 0.2862015097129637, + "eval_loss": 2.142565965652466, + "eval_runtime": 82.4636, + "eval_samples_per_second": 60.633, + "eval_steps_per_second": 15.158, + "num_input_tokens_seen": 15726072896, + "step": 30000 + }, + { + "epoch": 0.286678512229152, + "grad_norm": 0.1321924477815628, + "learning_rate": 0.001, + "loss": 2.2292, + "num_input_tokens_seen": 15752287296, + "step": 30050 + }, + { + "epoch": 0.2871555147453403, + "grad_norm": 0.13304558396339417, + "learning_rate": 0.001, + "loss": 2.2254, + "num_input_tokens_seen": 15778491232, + "step": 30100 + }, + { + "epoch": 0.28763251726152855, + "grad_norm": 0.146531879901886, + "learning_rate": 0.001, + "loss": 2.231, + "num_input_tokens_seen": 15804705632, + "step": 30150 + }, + { + "epoch": 0.2881095197777168, + "grad_norm": 0.14188075065612793, + "learning_rate": 0.001, + "loss": 2.2163, + "num_input_tokens_seen": 15830915584, + "step": 30200 + }, + { + "epoch": 0.2885865222939051, + "grad_norm": 0.1407197266817093, + "learning_rate": 0.001, + "loss": 2.2344, + "num_input_tokens_seen": 15857128832, + "step": 30250 + }, + { + "epoch": 0.2890635248100934, + "grad_norm": 0.137710839509964, + "learning_rate": 0.001, + "loss": 2.2233, + "num_input_tokens_seen": 15883341600, + "step": 30300 + }, + { + "epoch": 0.28954052732628166, + "grad_norm": 0.15242904424667358, + "learning_rate": 0.001, + "loss": 2.2306, + "num_input_tokens_seen": 15909551680, + "step": 30350 + }, + { + "epoch": 0.2900175298424699, + "grad_norm": 0.1370503157377243, + "learning_rate": 0.001, + "loss": 2.2259, + "num_input_tokens_seen": 15935762624, + "step": 30400 + }, + { + "epoch": 0.2904945323586582, + "grad_norm": 0.14922258257865906, + "learning_rate": 0.001, + "loss": 2.2201, + "num_input_tokens_seen": 15961972960, + "step": 30450 + }, + { + "epoch": 0.29097153487484645, + "grad_norm": 0.15227100253105164, + "learning_rate": 0.001, + "loss": 2.2318, + "num_input_tokens_seen": 15988180544, + "step": 30500 + }, + { + "epoch": 0.29097153487484645, + "eval_loss": 2.1403589248657227, + "eval_runtime": 82.3532, + "eval_samples_per_second": 60.714, + "eval_steps_per_second": 15.179, + "num_input_tokens_seen": 15988180544, + "step": 30500 + }, + { + "epoch": 0.2914485373910347, + "grad_norm": 0.14497631788253784, + "learning_rate": 0.001, + "loss": 2.2362, + "num_input_tokens_seen": 16014385184, + "step": 30550 + }, + { + "epoch": 0.29192553990722303, + "grad_norm": 0.16184133291244507, + "learning_rate": 0.001, + "loss": 2.2189, + "num_input_tokens_seen": 16040596480, + "step": 30600 + }, + { + "epoch": 0.2924025424234113, + "grad_norm": 0.1632627546787262, + "learning_rate": 0.001, + "loss": 2.2354, + "num_input_tokens_seen": 16066805632, + "step": 30650 + }, + { + "epoch": 0.29287954493959956, + "grad_norm": 0.13771861791610718, + "learning_rate": 0.001, + "loss": 2.2261, + "num_input_tokens_seen": 16093020000, + "step": 30700 + }, + { + "epoch": 0.2933565474557878, + "grad_norm": 0.13185714185237885, + "learning_rate": 0.001, + "loss": 2.2269, + "num_input_tokens_seen": 16119232416, + "step": 30750 + }, + { + "epoch": 0.2938335499719761, + "grad_norm": 0.1477263867855072, + "learning_rate": 0.001, + "loss": 2.2512, + "num_input_tokens_seen": 16145442176, + "step": 30800 + }, + { + "epoch": 0.2943105524881644, + "grad_norm": 0.13301660120487213, + "learning_rate": 0.001, + "loss": 2.2361, + "num_input_tokens_seen": 16171653888, + "step": 30850 + }, + { + "epoch": 0.29478755500435266, + "grad_norm": 0.1365855634212494, + "learning_rate": 0.001, + "loss": 2.2314, + "num_input_tokens_seen": 16197866368, + "step": 30900 + }, + { + "epoch": 0.2952645575205409, + "grad_norm": 0.13683389127254486, + "learning_rate": 0.001, + "loss": 2.2174, + "num_input_tokens_seen": 16224078272, + "step": 30950 + }, + { + "epoch": 0.2957415600367292, + "grad_norm": 0.14148303866386414, + "learning_rate": 0.001, + "loss": 2.2156, + "num_input_tokens_seen": 16250292672, + "step": 31000 + }, + { + "epoch": 0.2957415600367292, + "eval_loss": 2.1403136253356934, + "eval_runtime": 82.1466, + "eval_samples_per_second": 60.867, + "eval_steps_per_second": 15.217, + "num_input_tokens_seen": 16250292672, + "step": 31000 + }, + { + "epoch": 0.29621856255291745, + "grad_norm": 0.13794481754302979, + "learning_rate": 0.001, + "loss": 2.2231, + "num_input_tokens_seen": 16276495104, + "step": 31050 + }, + { + "epoch": 0.2966955650691057, + "grad_norm": 0.13724444806575775, + "learning_rate": 0.001, + "loss": 2.2262, + "num_input_tokens_seen": 16302705760, + "step": 31100 + }, + { + "epoch": 0.29717256758529403, + "grad_norm": 0.14742279052734375, + "learning_rate": 0.001, + "loss": 2.2117, + "num_input_tokens_seen": 16328920160, + "step": 31150 + }, + { + "epoch": 0.2976495701014823, + "grad_norm": 0.15125079452991486, + "learning_rate": 0.001, + "loss": 2.23, + "num_input_tokens_seen": 16355134560, + "step": 31200 + }, + { + "epoch": 0.29812657261767056, + "grad_norm": 0.13968023657798767, + "learning_rate": 0.001, + "loss": 2.2199, + "num_input_tokens_seen": 16381347072, + "step": 31250 + }, + { + "epoch": 0.2986035751338588, + "grad_norm": 0.1456058770418167, + "learning_rate": 0.001, + "loss": 2.2239, + "num_input_tokens_seen": 16407551584, + "step": 31300 + }, + { + "epoch": 0.2990805776500471, + "grad_norm": 0.1414702981710434, + "learning_rate": 0.001, + "loss": 2.2312, + "num_input_tokens_seen": 16433761632, + "step": 31350 + }, + { + "epoch": 0.2995575801662354, + "grad_norm": 0.14494173228740692, + "learning_rate": 0.001, + "loss": 2.2159, + "num_input_tokens_seen": 16459971584, + "step": 31400 + }, + { + "epoch": 0.30003458268242367, + "grad_norm": 0.1383238434791565, + "learning_rate": 0.001, + "loss": 2.2192, + "num_input_tokens_seen": 16486183552, + "step": 31450 + }, + { + "epoch": 0.30051158519861193, + "grad_norm": 0.1489211916923523, + "learning_rate": 0.001, + "loss": 2.2184, + "num_input_tokens_seen": 16512390176, + "step": 31500 + }, + { + "epoch": 0.30051158519861193, + "eval_loss": 2.137352466583252, + "eval_runtime": 82.2794, + "eval_samples_per_second": 60.769, + "eval_steps_per_second": 15.192, + "num_input_tokens_seen": 16512390176, + "step": 31500 + }, + { + "epoch": 0.3009885877148002, + "grad_norm": 0.13963983952999115, + "learning_rate": 0.001, + "loss": 2.2169, + "num_input_tokens_seen": 16538590176, + "step": 31550 + }, + { + "epoch": 0.30146559023098846, + "grad_norm": 0.1359083205461502, + "learning_rate": 0.001, + "loss": 2.2186, + "num_input_tokens_seen": 16564803488, + "step": 31600 + }, + { + "epoch": 0.3019425927471767, + "grad_norm": 0.14997832477092743, + "learning_rate": 0.001, + "loss": 2.2268, + "num_input_tokens_seen": 16591011264, + "step": 31650 + }, + { + "epoch": 0.30241959526336504, + "grad_norm": 0.15025894343852997, + "learning_rate": 0.001, + "loss": 2.2057, + "num_input_tokens_seen": 16617204384, + "step": 31700 + }, + { + "epoch": 0.3028965977795533, + "grad_norm": 0.15080207586288452, + "learning_rate": 0.001, + "loss": 2.2183, + "num_input_tokens_seen": 16643416384, + "step": 31750 + }, + { + "epoch": 0.30337360029574156, + "grad_norm": 0.14180131256580353, + "learning_rate": 0.001, + "loss": 2.2197, + "num_input_tokens_seen": 16669626720, + "step": 31800 + }, + { + "epoch": 0.3038506028119298, + "grad_norm": 0.13369297981262207, + "learning_rate": 0.001, + "loss": 2.2204, + "num_input_tokens_seen": 16695839904, + "step": 31850 + }, + { + "epoch": 0.3043276053281181, + "grad_norm": 0.12933358550071716, + "learning_rate": 0.001, + "loss": 2.2146, + "num_input_tokens_seen": 16722049312, + "step": 31900 + }, + { + "epoch": 0.30480460784430635, + "grad_norm": 0.14158771932125092, + "learning_rate": 0.001, + "loss": 2.2189, + "num_input_tokens_seen": 16748263712, + "step": 31950 + }, + { + "epoch": 0.30528161036049467, + "grad_norm": 0.13659563660621643, + "learning_rate": 0.001, + "loss": 2.2261, + "num_input_tokens_seen": 16774478112, + "step": 32000 + }, + { + "epoch": 0.30528161036049467, + "eval_loss": 2.134640693664551, + "eval_runtime": 82.2448, + "eval_samples_per_second": 60.794, + "eval_steps_per_second": 15.199, + "num_input_tokens_seen": 16774478112, + "step": 32000 + }, + { + "epoch": 0.30575861287668293, + "grad_norm": 0.1510363072156906, + "learning_rate": 0.001, + "loss": 2.2115, + "num_input_tokens_seen": 16800686624, + "step": 32050 + }, + { + "epoch": 0.3062356153928712, + "grad_norm": 0.13621051609516144, + "learning_rate": 0.001, + "loss": 2.2137, + "num_input_tokens_seen": 16826897024, + "step": 32100 + }, + { + "epoch": 0.30671261790905946, + "grad_norm": 0.15500463545322418, + "learning_rate": 0.001, + "loss": 2.2202, + "num_input_tokens_seen": 16853103904, + "step": 32150 + }, + { + "epoch": 0.3071896204252477, + "grad_norm": 0.17024828493595123, + "learning_rate": 0.001, + "loss": 2.2153, + "num_input_tokens_seen": 16879308320, + "step": 32200 + }, + { + "epoch": 0.30766662294143604, + "grad_norm": 0.14913325011730194, + "learning_rate": 0.001, + "loss": 2.2106, + "num_input_tokens_seen": 16905522592, + "step": 32250 + }, + { + "epoch": 0.3081436254576243, + "grad_norm": 0.1486227959394455, + "learning_rate": 0.001, + "loss": 2.2141, + "num_input_tokens_seen": 16931727424, + "step": 32300 + }, + { + "epoch": 0.30862062797381257, + "grad_norm": 0.1393032670021057, + "learning_rate": 0.001, + "loss": 2.2127, + "num_input_tokens_seen": 16957941504, + "step": 32350 + }, + { + "epoch": 0.30909763049000083, + "grad_norm": 0.13792607188224792, + "learning_rate": 0.001, + "loss": 2.2021, + "num_input_tokens_seen": 16984153952, + "step": 32400 + }, + { + "epoch": 0.3095746330061891, + "grad_norm": 0.14202407002449036, + "learning_rate": 0.001, + "loss": 2.2192, + "num_input_tokens_seen": 17010368352, + "step": 32450 + }, + { + "epoch": 0.31005163552237736, + "grad_norm": 0.15250712633132935, + "learning_rate": 0.001, + "loss": 2.2091, + "num_input_tokens_seen": 17036582752, + "step": 32500 + }, + { + "epoch": 0.31005163552237736, + "eval_loss": 2.1334831714630127, + "eval_runtime": 82.4889, + "eval_samples_per_second": 60.614, + "eval_steps_per_second": 15.154, + "num_input_tokens_seen": 17036582752, + "step": 32500 + }, + { + "epoch": 0.3105286380385657, + "grad_norm": 0.15931129455566406, + "learning_rate": 0.001, + "loss": 2.2171, + "num_input_tokens_seen": 17062797152, + "step": 32550 + }, + { + "epoch": 0.31100564055475394, + "grad_norm": 0.1495935022830963, + "learning_rate": 0.001, + "loss": 2.2178, + "num_input_tokens_seen": 17089009728, + "step": 32600 + }, + { + "epoch": 0.3114826430709422, + "grad_norm": 0.1444777250289917, + "learning_rate": 0.001, + "loss": 2.2206, + "num_input_tokens_seen": 17115219360, + "step": 32650 + }, + { + "epoch": 0.31195964558713046, + "grad_norm": 0.13968896865844727, + "learning_rate": 0.001, + "loss": 2.213, + "num_input_tokens_seen": 17141432480, + "step": 32700 + }, + { + "epoch": 0.3124366481033187, + "grad_norm": 0.14426162838935852, + "learning_rate": 0.001, + "loss": 2.2091, + "num_input_tokens_seen": 17167643776, + "step": 32750 + }, + { + "epoch": 0.31291365061950704, + "grad_norm": 0.15707091987133026, + "learning_rate": 0.001, + "loss": 2.219, + "num_input_tokens_seen": 17193858176, + "step": 32800 + }, + { + "epoch": 0.3133906531356953, + "grad_norm": 0.14893439412117004, + "learning_rate": 0.001, + "loss": 2.2213, + "num_input_tokens_seen": 17220072576, + "step": 32850 + }, + { + "epoch": 0.31386765565188357, + "grad_norm": 0.15472280979156494, + "learning_rate": 0.001, + "loss": 2.2195, + "num_input_tokens_seen": 17246283712, + "step": 32900 + }, + { + "epoch": 0.31434465816807183, + "grad_norm": 0.13622242212295532, + "learning_rate": 0.001, + "loss": 2.2008, + "num_input_tokens_seen": 17272492064, + "step": 32950 + }, + { + "epoch": 0.3148216606842601, + "grad_norm": 0.14335715770721436, + "learning_rate": 0.001, + "loss": 2.2222, + "num_input_tokens_seen": 17298706464, + "step": 33000 + }, + { + "epoch": 0.3148216606842601, + "eval_loss": 2.130917549133301, + "eval_runtime": 81.5143, + "eval_samples_per_second": 61.339, + "eval_steps_per_second": 15.335, + "num_input_tokens_seen": 17298706464, + "step": 33000 + }, + { + "epoch": 0.31529866320044836, + "grad_norm": 0.138872429728508, + "learning_rate": 0.001, + "loss": 2.2158, + "num_input_tokens_seen": 17324910368, + "step": 33050 + }, + { + "epoch": 0.3157756657166367, + "grad_norm": 0.1603110432624817, + "learning_rate": 0.001, + "loss": 2.2118, + "num_input_tokens_seen": 17351124768, + "step": 33100 + }, + { + "epoch": 0.31625266823282494, + "grad_norm": 0.15042147040367126, + "learning_rate": 0.001, + "loss": 2.218, + "num_input_tokens_seen": 17377337600, + "step": 33150 + }, + { + "epoch": 0.3167296707490132, + "grad_norm": 0.13590936362743378, + "learning_rate": 0.001, + "loss": 2.2239, + "num_input_tokens_seen": 17403547008, + "step": 33200 + }, + { + "epoch": 0.31720667326520147, + "grad_norm": 0.14794215559959412, + "learning_rate": 0.001, + "loss": 2.2039, + "num_input_tokens_seen": 17429759456, + "step": 33250 + }, + { + "epoch": 0.31768367578138973, + "grad_norm": 0.15067002177238464, + "learning_rate": 0.001, + "loss": 2.2089, + "num_input_tokens_seen": 17455966240, + "step": 33300 + }, + { + "epoch": 0.318160678297578, + "grad_norm": 0.13929149508476257, + "learning_rate": 0.001, + "loss": 2.2056, + "num_input_tokens_seen": 17482177952, + "step": 33350 + }, + { + "epoch": 0.3186376808137663, + "grad_norm": 0.13707947731018066, + "learning_rate": 0.001, + "loss": 2.2194, + "num_input_tokens_seen": 17508390848, + "step": 33400 + }, + { + "epoch": 0.3191146833299546, + "grad_norm": 0.13600605726242065, + "learning_rate": 0.001, + "loss": 2.2219, + "num_input_tokens_seen": 17534602528, + "step": 33450 + }, + { + "epoch": 0.31959168584614284, + "grad_norm": 0.16074150800704956, + "learning_rate": 0.001, + "loss": 2.2172, + "num_input_tokens_seen": 17560813216, + "step": 33500 + }, + { + "epoch": 0.31959168584614284, + "eval_loss": 2.1288042068481445, + "eval_runtime": 82.01, + "eval_samples_per_second": 60.968, + "eval_steps_per_second": 15.242, + "num_input_tokens_seen": 17560813216, + "step": 33500 + }, + { + "epoch": 0.3200686883623311, + "grad_norm": 0.1537347286939621, + "learning_rate": 0.001, + "loss": 2.2128, + "num_input_tokens_seen": 17587027616, + "step": 33550 + }, + { + "epoch": 0.32054569087851936, + "grad_norm": 0.15225750207901, + "learning_rate": 0.001, + "loss": 2.2182, + "num_input_tokens_seen": 17613240704, + "step": 33600 + }, + { + "epoch": 0.3210226933947077, + "grad_norm": 0.17453214526176453, + "learning_rate": 0.001, + "loss": 2.2089, + "num_input_tokens_seen": 17639452224, + "step": 33650 + }, + { + "epoch": 0.32149969591089594, + "grad_norm": 0.13869380950927734, + "learning_rate": 0.001, + "loss": 2.2164, + "num_input_tokens_seen": 17665664864, + "step": 33700 + }, + { + "epoch": 0.3219766984270842, + "grad_norm": 0.1399545818567276, + "learning_rate": 0.001, + "loss": 2.2066, + "num_input_tokens_seen": 17691877472, + "step": 33750 + }, + { + "epoch": 0.32245370094327247, + "grad_norm": 0.15858028829097748, + "learning_rate": 0.001, + "loss": 2.2062, + "num_input_tokens_seen": 17718076992, + "step": 33800 + }, + { + "epoch": 0.32293070345946073, + "grad_norm": 0.14098668098449707, + "learning_rate": 0.001, + "loss": 2.2124, + "num_input_tokens_seen": 17744290080, + "step": 33850 + }, + { + "epoch": 0.323407705975649, + "grad_norm": 0.15374623239040375, + "learning_rate": 0.001, + "loss": 2.2017, + "num_input_tokens_seen": 17770502944, + "step": 33900 + }, + { + "epoch": 0.3238847084918373, + "grad_norm": 0.15962567925453186, + "learning_rate": 0.001, + "loss": 2.2095, + "num_input_tokens_seen": 17796710240, + "step": 33950 + }, + { + "epoch": 0.3243617110080256, + "grad_norm": 0.14005784690380096, + "learning_rate": 0.001, + "loss": 2.2003, + "num_input_tokens_seen": 17822921152, + "step": 34000 + }, + { + "epoch": 0.3243617110080256, + "eval_loss": 2.127488613128662, + "eval_runtime": 82.0715, + "eval_samples_per_second": 60.923, + "eval_steps_per_second": 15.231, + "num_input_tokens_seen": 17822921152, + "step": 34000 + }, + { + "epoch": 0.32483871352421384, + "grad_norm": 0.15280354022979736, + "learning_rate": 0.001, + "loss": 2.2068, + "num_input_tokens_seen": 17849135552, + "step": 34050 + }, + { + "epoch": 0.3253157160404021, + "grad_norm": 0.13673514127731323, + "learning_rate": 0.001, + "loss": 2.1972, + "num_input_tokens_seen": 17875349632, + "step": 34100 + }, + { + "epoch": 0.32579271855659037, + "grad_norm": 0.13414499163627625, + "learning_rate": 0.001, + "loss": 2.2037, + "num_input_tokens_seen": 17901564032, + "step": 34150 + }, + { + "epoch": 0.3262697210727787, + "grad_norm": 0.16514766216278076, + "learning_rate": 0.001, + "loss": 2.2569, + "num_input_tokens_seen": 17927772512, + "step": 34200 + }, + { + "epoch": 0.32674672358896695, + "grad_norm": 0.14111830294132233, + "learning_rate": 0.001, + "loss": 2.2395, + "num_input_tokens_seen": 17953984416, + "step": 34250 + }, + { + "epoch": 0.3272237261051552, + "grad_norm": 0.1439828723669052, + "learning_rate": 0.001, + "loss": 2.2155, + "num_input_tokens_seen": 17980194464, + "step": 34300 + }, + { + "epoch": 0.3277007286213435, + "grad_norm": 0.13390502333641052, + "learning_rate": 0.001, + "loss": 2.2226, + "num_input_tokens_seen": 18006404832, + "step": 34350 + }, + { + "epoch": 0.32817773113753174, + "grad_norm": 0.1647501289844513, + "learning_rate": 0.001, + "loss": 2.2113, + "num_input_tokens_seen": 18032606304, + "step": 34400 + }, + { + "epoch": 0.32865473365372, + "grad_norm": 0.14238382875919342, + "learning_rate": 0.001, + "loss": 2.219, + "num_input_tokens_seen": 18058813856, + "step": 34450 + }, + { + "epoch": 0.3291317361699083, + "grad_norm": 0.13778063654899597, + "learning_rate": 0.001, + "loss": 2.2152, + "num_input_tokens_seen": 18085018528, + "step": 34500 + }, + { + "epoch": 0.3291317361699083, + "eval_loss": 2.1281445026397705, + "eval_runtime": 81.5943, + "eval_samples_per_second": 61.279, + "eval_steps_per_second": 15.32, + "num_input_tokens_seen": 18085018528, + "step": 34500 + }, + { + "epoch": 0.3296087386860966, + "grad_norm": 0.144911527633667, + "learning_rate": 0.001, + "loss": 2.2176, + "num_input_tokens_seen": 18111225184, + "step": 34550 + }, + { + "epoch": 0.33008574120228484, + "grad_norm": 0.1375516951084137, + "learning_rate": 0.001, + "loss": 2.2034, + "num_input_tokens_seen": 18137439584, + "step": 34600 + }, + { + "epoch": 0.3305627437184731, + "grad_norm": 0.1483387053012848, + "learning_rate": 0.001, + "loss": 2.2139, + "num_input_tokens_seen": 18163649536, + "step": 34650 + }, + { + "epoch": 0.33103974623466137, + "grad_norm": 0.13756632804870605, + "learning_rate": 0.001, + "loss": 2.2142, + "num_input_tokens_seen": 18189863936, + "step": 34700 + }, + { + "epoch": 0.3315167487508497, + "grad_norm": 0.15277694165706635, + "learning_rate": 0.001, + "loss": 2.2, + "num_input_tokens_seen": 18216078336, + "step": 34750 + }, + { + "epoch": 0.33199375126703795, + "grad_norm": 0.1554708331823349, + "learning_rate": 0.001, + "loss": 2.1976, + "num_input_tokens_seen": 18242291840, + "step": 34800 + }, + { + "epoch": 0.3324707537832262, + "grad_norm": 0.15377846360206604, + "learning_rate": 0.001, + "loss": 2.2152, + "num_input_tokens_seen": 18268506240, + "step": 34850 + }, + { + "epoch": 0.3329477562994145, + "grad_norm": 0.14748121798038483, + "learning_rate": 0.001, + "loss": 2.2081, + "num_input_tokens_seen": 18294720640, + "step": 34900 + }, + { + "epoch": 0.33342475881560274, + "grad_norm": 0.13819515705108643, + "learning_rate": 0.001, + "loss": 2.2003, + "num_input_tokens_seen": 18320932768, + "step": 34950 + }, + { + "epoch": 0.333901761331791, + "grad_norm": 0.15223775804042816, + "learning_rate": 0.001, + "loss": 2.2076, + "num_input_tokens_seen": 18347147168, + "step": 35000 + }, + { + "epoch": 0.333901761331791, + "eval_loss": 2.1236226558685303, + "eval_runtime": 82.2565, + "eval_samples_per_second": 60.785, + "eval_steps_per_second": 15.196, + "num_input_tokens_seen": 18347147168, + "step": 35000 + }, + { + "epoch": 0.3343787638479793, + "grad_norm": 0.2939516603946686, + "learning_rate": 0.001, + "loss": 2.1954, + "num_input_tokens_seen": 18373355744, + "step": 35050 + }, + { + "epoch": 0.3348557663641676, + "grad_norm": 0.1514858454465866, + "learning_rate": 0.001, + "loss": 2.1988, + "num_input_tokens_seen": 18399565696, + "step": 35100 + }, + { + "epoch": 0.33533276888035585, + "grad_norm": 0.1336488127708435, + "learning_rate": 0.001, + "loss": 2.2029, + "num_input_tokens_seen": 18425766080, + "step": 35150 + }, + { + "epoch": 0.3358097713965441, + "grad_norm": 0.14635372161865234, + "learning_rate": 0.001, + "loss": 2.2098, + "num_input_tokens_seen": 18451970176, + "step": 35200 + }, + { + "epoch": 0.3362867739127324, + "grad_norm": 0.13536489009857178, + "learning_rate": 0.001, + "loss": 2.2025, + "num_input_tokens_seen": 18478179808, + "step": 35250 + }, + { + "epoch": 0.33676377642892064, + "grad_norm": 0.13300848007202148, + "learning_rate": 0.001, + "loss": 2.2052, + "num_input_tokens_seen": 18504390624, + "step": 35300 + }, + { + "epoch": 0.33724077894510895, + "grad_norm": 0.1379849761724472, + "learning_rate": 0.001, + "loss": 2.2107, + "num_input_tokens_seen": 18530605024, + "step": 35350 + }, + { + "epoch": 0.3377177814612972, + "grad_norm": 0.14962108433246613, + "learning_rate": 0.001, + "loss": 2.2162, + "num_input_tokens_seen": 18556814720, + "step": 35400 + }, + { + "epoch": 0.3381947839774855, + "grad_norm": 0.13894064724445343, + "learning_rate": 0.001, + "loss": 2.2014, + "num_input_tokens_seen": 18583020256, + "step": 35450 + }, + { + "epoch": 0.33867178649367374, + "grad_norm": 0.13783761858940125, + "learning_rate": 0.001, + "loss": 2.2005, + "num_input_tokens_seen": 18609222656, + "step": 35500 + }, + { + "epoch": 0.33867178649367374, + "eval_loss": 2.1224164962768555, + "eval_runtime": 82.5781, + "eval_samples_per_second": 60.549, + "eval_steps_per_second": 15.137, + "num_input_tokens_seen": 18609222656, + "step": 35500 + }, + { + "epoch": 0.339148789009862, + "grad_norm": 0.1444152593612671, + "learning_rate": 0.001, + "loss": 2.2034, + "num_input_tokens_seen": 18635432384, + "step": 35550 + }, + { + "epoch": 0.3396257915260503, + "grad_norm": 0.1575513333082199, + "learning_rate": 0.001, + "loss": 2.2109, + "num_input_tokens_seen": 18661645568, + "step": 35600 + }, + { + "epoch": 0.3401027940422386, + "grad_norm": 0.138763889670372, + "learning_rate": 0.001, + "loss": 2.1982, + "num_input_tokens_seen": 18687859968, + "step": 35650 + }, + { + "epoch": 0.34057979655842685, + "grad_norm": 0.16039063036441803, + "learning_rate": 0.001, + "loss": 2.2078, + "num_input_tokens_seen": 18714071840, + "step": 35700 + }, + { + "epoch": 0.3410567990746151, + "grad_norm": 0.1333894282579422, + "learning_rate": 0.001, + "loss": 2.2066, + "num_input_tokens_seen": 18740277952, + "step": 35750 + }, + { + "epoch": 0.3415338015908034, + "grad_norm": 0.15125052630901337, + "learning_rate": 0.001, + "loss": 2.2071, + "num_input_tokens_seen": 18766492352, + "step": 35800 + }, + { + "epoch": 0.34201080410699164, + "grad_norm": 0.13908621668815613, + "learning_rate": 0.001, + "loss": 2.1832, + "num_input_tokens_seen": 18792706752, + "step": 35850 + }, + { + "epoch": 0.34248780662317996, + "grad_norm": 0.1455029994249344, + "learning_rate": 0.001, + "loss": 2.2048, + "num_input_tokens_seen": 18818913376, + "step": 35900 + }, + { + "epoch": 0.3429648091393682, + "grad_norm": 0.14915207028388977, + "learning_rate": 0.001, + "loss": 2.2016, + "num_input_tokens_seen": 18845127776, + "step": 35950 + }, + { + "epoch": 0.3434418116555565, + "grad_norm": 0.15712140500545502, + "learning_rate": 0.001, + "loss": 2.1975, + "num_input_tokens_seen": 18871341760, + "step": 36000 + }, + { + "epoch": 0.3434418116555565, + "eval_loss": 2.119873523712158, + "eval_runtime": 82.5855, + "eval_samples_per_second": 60.543, + "eval_steps_per_second": 15.136, + "num_input_tokens_seen": 18871341760, + "step": 36000 + }, + { + "epoch": 0.34391881417174475, + "grad_norm": 0.12918758392333984, + "learning_rate": 0.001, + "loss": 2.2082, + "num_input_tokens_seen": 18897547296, + "step": 36050 + }, + { + "epoch": 0.344395816687933, + "grad_norm": 0.16404707729816437, + "learning_rate": 0.001, + "loss": 2.2093, + "num_input_tokens_seen": 18923754720, + "step": 36100 + }, + { + "epoch": 0.34487281920412133, + "grad_norm": 0.1493569314479828, + "learning_rate": 0.001, + "loss": 2.2034, + "num_input_tokens_seen": 18949959648, + "step": 36150 + }, + { + "epoch": 0.3453498217203096, + "grad_norm": 0.13502757251262665, + "learning_rate": 0.001, + "loss": 2.2005, + "num_input_tokens_seen": 18976172032, + "step": 36200 + }, + { + "epoch": 0.34582682423649785, + "grad_norm": 0.148860901594162, + "learning_rate": 0.001, + "loss": 2.1922, + "num_input_tokens_seen": 19002385664, + "step": 36250 + }, + { + "epoch": 0.3463038267526861, + "grad_norm": 0.13627836108207703, + "learning_rate": 0.001, + "loss": 2.2087, + "num_input_tokens_seen": 19028600064, + "step": 36300 + }, + { + "epoch": 0.3467808292688744, + "grad_norm": 0.1387259066104889, + "learning_rate": 0.001, + "loss": 2.1953, + "num_input_tokens_seen": 19054810240, + "step": 36350 + }, + { + "epoch": 0.34725783178506264, + "grad_norm": 0.15676312148571014, + "learning_rate": 0.001, + "loss": 2.2, + "num_input_tokens_seen": 19081021984, + "step": 36400 + }, + { + "epoch": 0.34773483430125096, + "grad_norm": 0.1438487023115158, + "learning_rate": 0.001, + "loss": 2.2066, + "num_input_tokens_seen": 19107227616, + "step": 36450 + }, + { + "epoch": 0.3482118368174392, + "grad_norm": 0.1509568840265274, + "learning_rate": 0.001, + "loss": 2.201, + "num_input_tokens_seen": 19133442016, + "step": 36500 + }, + { + "epoch": 0.3482118368174392, + "eval_loss": 2.117922306060791, + "eval_runtime": 82.7969, + "eval_samples_per_second": 60.389, + "eval_steps_per_second": 15.097, + "num_input_tokens_seen": 19133442016, + "step": 36500 + }, + { + "epoch": 0.3486888393336275, + "grad_norm": 0.14427947998046875, + "learning_rate": 0.001, + "loss": 2.2007, + "num_input_tokens_seen": 19159654944, + "step": 36550 + }, + { + "epoch": 0.34916584184981575, + "grad_norm": 0.1476566195487976, + "learning_rate": 0.001, + "loss": 2.197, + "num_input_tokens_seen": 19185859168, + "step": 36600 + }, + { + "epoch": 0.349642844366004, + "grad_norm": 0.13864775002002716, + "learning_rate": 0.001, + "loss": 2.201, + "num_input_tokens_seen": 19212073568, + "step": 36650 + }, + { + "epoch": 0.3501198468821923, + "grad_norm": 0.14173488318920135, + "learning_rate": 0.001, + "loss": 2.2017, + "num_input_tokens_seen": 19238287968, + "step": 36700 + }, + { + "epoch": 0.3505968493983806, + "grad_norm": 0.1381407082080841, + "learning_rate": 0.001, + "loss": 2.2103, + "num_input_tokens_seen": 19264502368, + "step": 36750 + }, + { + "epoch": 0.35107385191456886, + "grad_norm": 0.15552376210689545, + "learning_rate": 0.001, + "loss": 2.2028, + "num_input_tokens_seen": 19290709856, + "step": 36800 + }, + { + "epoch": 0.3515508544307571, + "grad_norm": 0.15400223433971405, + "learning_rate": 0.001, + "loss": 2.1996, + "num_input_tokens_seen": 19316919616, + "step": 36850 + }, + { + "epoch": 0.3520278569469454, + "grad_norm": 0.14455287158489227, + "learning_rate": 0.001, + "loss": 2.1958, + "num_input_tokens_seen": 19343128480, + "step": 36900 + }, + { + "epoch": 0.35250485946313365, + "grad_norm": 0.13802266120910645, + "learning_rate": 0.001, + "loss": 2.2072, + "num_input_tokens_seen": 19369341216, + "step": 36950 + }, + { + "epoch": 0.35298186197932196, + "grad_norm": 0.1613599807024002, + "learning_rate": 0.001, + "loss": 2.1968, + "num_input_tokens_seen": 19395541920, + "step": 37000 + }, + { + "epoch": 0.35298186197932196, + "eval_loss": 2.116856336593628, + "eval_runtime": 82.1293, + "eval_samples_per_second": 60.88, + "eval_steps_per_second": 15.22, + "num_input_tokens_seen": 19395541920, + "step": 37000 + }, + { + "epoch": 0.3534588644955102, + "grad_norm": 0.14134734869003296, + "learning_rate": 0.001, + "loss": 2.1956, + "num_input_tokens_seen": 19421746432, + "step": 37050 + }, + { + "epoch": 0.3539358670116985, + "grad_norm": 0.13640139997005463, + "learning_rate": 0.001, + "loss": 2.1917, + "num_input_tokens_seen": 19447955168, + "step": 37100 + }, + { + "epoch": 0.35441286952788675, + "grad_norm": 0.15265345573425293, + "learning_rate": 0.001, + "loss": 2.1883, + "num_input_tokens_seen": 19474169568, + "step": 37150 + }, + { + "epoch": 0.354889872044075, + "grad_norm": 0.1389395147562027, + "learning_rate": 0.001, + "loss": 2.2005, + "num_input_tokens_seen": 19500383968, + "step": 37200 + }, + { + "epoch": 0.3553668745602633, + "grad_norm": 0.1275651752948761, + "learning_rate": 0.001, + "loss": 2.203, + "num_input_tokens_seen": 19526592832, + "step": 37250 + }, + { + "epoch": 0.3558438770764516, + "grad_norm": 0.1409410983324051, + "learning_rate": 0.001, + "loss": 2.204, + "num_input_tokens_seen": 19552803712, + "step": 37300 + }, + { + "epoch": 0.35632087959263986, + "grad_norm": 0.14228691160678864, + "learning_rate": 0.001, + "loss": 2.2163, + "num_input_tokens_seen": 19579003200, + "step": 37350 + }, + { + "epoch": 0.3567978821088281, + "grad_norm": 0.13135015964508057, + "learning_rate": 0.001, + "loss": 2.1991, + "num_input_tokens_seen": 19605215264, + "step": 37400 + }, + { + "epoch": 0.3572748846250164, + "grad_norm": 0.14535236358642578, + "learning_rate": 0.001, + "loss": 2.2015, + "num_input_tokens_seen": 19631429664, + "step": 37450 + }, + { + "epoch": 0.35775188714120465, + "grad_norm": 0.14450703561306, + "learning_rate": 0.001, + "loss": 2.204, + "num_input_tokens_seen": 19657633088, + "step": 37500 + }, + { + "epoch": 0.35775188714120465, + "eval_loss": 2.1144189834594727, + "eval_runtime": 82.3004, + "eval_samples_per_second": 60.753, + "eval_steps_per_second": 15.188, + "num_input_tokens_seen": 19657633088, + "step": 37500 + }, + { + "epoch": 0.35822888965739297, + "grad_norm": 0.14302517473697662, + "learning_rate": 0.001, + "loss": 2.2022, + "num_input_tokens_seen": 19683846464, + "step": 37550 + }, + { + "epoch": 0.35870589217358123, + "grad_norm": 0.13624528050422668, + "learning_rate": 0.001, + "loss": 2.2009, + "num_input_tokens_seen": 19710059744, + "step": 37600 + }, + { + "epoch": 0.3591828946897695, + "grad_norm": 0.14689582586288452, + "learning_rate": 0.001, + "loss": 2.1933, + "num_input_tokens_seen": 19736267520, + "step": 37650 + }, + { + "epoch": 0.35965989720595776, + "grad_norm": 0.1342869997024536, + "learning_rate": 0.001, + "loss": 2.1965, + "num_input_tokens_seen": 19762479264, + "step": 37700 + }, + { + "epoch": 0.360136899722146, + "grad_norm": 0.14294207096099854, + "learning_rate": 0.001, + "loss": 2.207, + "num_input_tokens_seen": 19788688896, + "step": 37750 + }, + { + "epoch": 0.3606139022383343, + "grad_norm": 0.13254527747631073, + "learning_rate": 0.001, + "loss": 2.2012, + "num_input_tokens_seen": 19814900576, + "step": 37800 + }, + { + "epoch": 0.3610909047545226, + "grad_norm": 0.1584700047969818, + "learning_rate": 0.001, + "loss": 2.1898, + "num_input_tokens_seen": 19841105472, + "step": 37850 + }, + { + "epoch": 0.36156790727071086, + "grad_norm": 0.14291736483573914, + "learning_rate": 0.001, + "loss": 2.1938, + "num_input_tokens_seen": 19867318208, + "step": 37900 + }, + { + "epoch": 0.3620449097868991, + "grad_norm": 0.13364264369010925, + "learning_rate": 0.001, + "loss": 2.1939, + "num_input_tokens_seen": 19893530176, + "step": 37950 + }, + { + "epoch": 0.3625219123030874, + "grad_norm": 0.1309192031621933, + "learning_rate": 0.001, + "loss": 2.1979, + "num_input_tokens_seen": 19919735840, + "step": 38000 + }, + { + "epoch": 0.3625219123030874, + "eval_loss": 2.1131739616394043, + "eval_runtime": 82.6146, + "eval_samples_per_second": 60.522, + "eval_steps_per_second": 15.131, + "num_input_tokens_seen": 19919735840, + "step": 38000 + }, + { + "epoch": 0.36299891481927565, + "grad_norm": 0.14044371247291565, + "learning_rate": 0.001, + "loss": 2.1977, + "num_input_tokens_seen": 19945948672, + "step": 38050 + }, + { + "epoch": 0.36347591733546397, + "grad_norm": 0.1485033631324768, + "learning_rate": 0.001, + "loss": 2.2044, + "num_input_tokens_seen": 19972149696, + "step": 38100 + }, + { + "epoch": 0.36395291985165223, + "grad_norm": 0.14868605136871338, + "learning_rate": 0.001, + "loss": 2.181, + "num_input_tokens_seen": 19998356640, + "step": 38150 + }, + { + "epoch": 0.3644299223678405, + "grad_norm": 0.1402069628238678, + "learning_rate": 0.001, + "loss": 2.2014, + "num_input_tokens_seen": 20024569184, + "step": 38200 + }, + { + "epoch": 0.36490692488402876, + "grad_norm": 0.14594897627830505, + "learning_rate": 0.001, + "loss": 2.203, + "num_input_tokens_seen": 20050777216, + "step": 38250 + }, + { + "epoch": 0.365383927400217, + "grad_norm": 0.13246594369411469, + "learning_rate": 0.001, + "loss": 2.2107, + "num_input_tokens_seen": 20076989984, + "step": 38300 + }, + { + "epoch": 0.3658609299164053, + "grad_norm": 0.1423010230064392, + "learning_rate": 0.001, + "loss": 2.2007, + "num_input_tokens_seen": 20103199008, + "step": 38350 + }, + { + "epoch": 0.3663379324325936, + "grad_norm": 0.1386810839176178, + "learning_rate": 0.001, + "loss": 2.187, + "num_input_tokens_seen": 20129413248, + "step": 38400 + }, + { + "epoch": 0.36681493494878187, + "grad_norm": 0.1479010432958603, + "learning_rate": 0.001, + "loss": 2.1924, + "num_input_tokens_seen": 20155625408, + "step": 38450 + }, + { + "epoch": 0.36729193746497013, + "grad_norm": 0.14512768387794495, + "learning_rate": 0.001, + "loss": 2.1898, + "num_input_tokens_seen": 20181833600, + "step": 38500 + }, + { + "epoch": 0.36729193746497013, + "eval_loss": 2.111013174057007, + "eval_runtime": 82.3701, + "eval_samples_per_second": 60.702, + "eval_steps_per_second": 15.175, + "num_input_tokens_seen": 20181833600, + "step": 38500 + }, + { + "epoch": 0.3677689399811584, + "grad_norm": 0.14894255995750427, + "learning_rate": 0.001, + "loss": 2.1869, + "num_input_tokens_seen": 20208045120, + "step": 38550 + }, + { + "epoch": 0.36824594249734666, + "grad_norm": 0.140294149518013, + "learning_rate": 0.001, + "loss": 2.2048, + "num_input_tokens_seen": 20234258528, + "step": 38600 + }, + { + "epoch": 0.3687229450135349, + "grad_norm": 0.14766459167003632, + "learning_rate": 0.001, + "loss": 2.1961, + "num_input_tokens_seen": 20260469632, + "step": 38650 + }, + { + "epoch": 0.36919994752972324, + "grad_norm": 0.1636905074119568, + "learning_rate": 0.001, + "loss": 2.1922, + "num_input_tokens_seen": 20286670944, + "step": 38700 + }, + { + "epoch": 0.3696769500459115, + "grad_norm": 0.14300905168056488, + "learning_rate": 0.001, + "loss": 2.2001, + "num_input_tokens_seen": 20312883104, + "step": 38750 + }, + { + "epoch": 0.37015395256209976, + "grad_norm": 0.14150425791740417, + "learning_rate": 0.001, + "loss": 2.1913, + "num_input_tokens_seen": 20339087360, + "step": 38800 + }, + { + "epoch": 0.370630955078288, + "grad_norm": 0.13723760843276978, + "learning_rate": 0.001, + "loss": 2.2031, + "num_input_tokens_seen": 20365290304, + "step": 38850 + }, + { + "epoch": 0.3711079575944763, + "grad_norm": 0.15464797616004944, + "learning_rate": 0.001, + "loss": 2.1848, + "num_input_tokens_seen": 20391495936, + "step": 38900 + }, + { + "epoch": 0.3715849601106646, + "grad_norm": 0.15645267069339752, + "learning_rate": 0.001, + "loss": 2.2064, + "num_input_tokens_seen": 20417707040, + "step": 38950 + }, + { + "epoch": 0.37206196262685287, + "grad_norm": 0.15567755699157715, + "learning_rate": 0.001, + "loss": 2.1859, + "num_input_tokens_seen": 20443920960, + "step": 39000 + }, + { + "epoch": 0.37206196262685287, + "eval_loss": 2.110989570617676, + "eval_runtime": 82.2487, + "eval_samples_per_second": 60.791, + "eval_steps_per_second": 15.198, + "num_input_tokens_seen": 20443920960, + "step": 39000 + }, + { + "epoch": 0.37253896514304113, + "grad_norm": 0.15416857600212097, + "learning_rate": 0.001, + "loss": 2.1945, + "num_input_tokens_seen": 20470129312, + "step": 39050 + }, + { + "epoch": 0.3730159676592294, + "grad_norm": 0.1459367424249649, + "learning_rate": 0.001, + "loss": 2.1895, + "num_input_tokens_seen": 20496341472, + "step": 39100 + }, + { + "epoch": 0.37349297017541766, + "grad_norm": 0.14670804142951965, + "learning_rate": 0.001, + "loss": 2.1992, + "num_input_tokens_seen": 20522548320, + "step": 39150 + }, + { + "epoch": 0.3739699726916059, + "grad_norm": 0.14550630748271942, + "learning_rate": 0.001, + "loss": 2.2061, + "num_input_tokens_seen": 20548758304, + "step": 39200 + }, + { + "epoch": 0.37444697520779424, + "grad_norm": 0.1529083102941513, + "learning_rate": 0.001, + "loss": 2.1914, + "num_input_tokens_seen": 20574971520, + "step": 39250 + }, + { + "epoch": 0.3749239777239825, + "grad_norm": 0.13946719467639923, + "learning_rate": 0.001, + "loss": 2.1839, + "num_input_tokens_seen": 20601184288, + "step": 39300 + }, + { + "epoch": 0.37540098024017077, + "grad_norm": 0.1529141515493393, + "learning_rate": 0.001, + "loss": 2.1962, + "num_input_tokens_seen": 20627397056, + "step": 39350 + }, + { + "epoch": 0.37587798275635903, + "grad_norm": 0.14381681382656097, + "learning_rate": 0.001, + "loss": 2.1934, + "num_input_tokens_seen": 20653601696, + "step": 39400 + }, + { + "epoch": 0.3763549852725473, + "grad_norm": 0.1383078545331955, + "learning_rate": 0.001, + "loss": 2.2031, + "num_input_tokens_seen": 20679806784, + "step": 39450 + }, + { + "epoch": 0.3768319877887356, + "grad_norm": 0.14761337637901306, + "learning_rate": 0.001, + "loss": 2.188, + "num_input_tokens_seen": 20706018592, + "step": 39500 + }, + { + "epoch": 0.3768319877887356, + "eval_loss": 2.1095550060272217, + "eval_runtime": 82.0109, + "eval_samples_per_second": 60.968, + "eval_steps_per_second": 15.242, + "num_input_tokens_seen": 20706018592, + "step": 39500 + }, + { + "epoch": 0.3773089903049239, + "grad_norm": 0.14556308090686798, + "learning_rate": 0.001, + "loss": 2.1888, + "num_input_tokens_seen": 20732225440, + "step": 39550 + }, + { + "epoch": 0.37778599282111214, + "grad_norm": 0.14401084184646606, + "learning_rate": 0.001, + "loss": 2.1913, + "num_input_tokens_seen": 20758439616, + "step": 39600 + }, + { + "epoch": 0.3782629953373004, + "grad_norm": 0.14126934111118317, + "learning_rate": 0.001, + "loss": 2.1935, + "num_input_tokens_seen": 20784651648, + "step": 39650 + }, + { + "epoch": 0.37873999785348866, + "grad_norm": 0.1369311660528183, + "learning_rate": 0.001, + "loss": 2.191, + "num_input_tokens_seen": 20810862880, + "step": 39700 + }, + { + "epoch": 0.3792170003696769, + "grad_norm": 0.1367167979478836, + "learning_rate": 0.001, + "loss": 2.1953, + "num_input_tokens_seen": 20837073536, + "step": 39750 + }, + { + "epoch": 0.37969400288586524, + "grad_norm": 0.14433887600898743, + "learning_rate": 0.001, + "loss": 2.1898, + "num_input_tokens_seen": 20863285568, + "step": 39800 + }, + { + "epoch": 0.3801710054020535, + "grad_norm": 0.13510778546333313, + "learning_rate": 0.001, + "loss": 2.1825, + "num_input_tokens_seen": 20889491744, + "step": 39850 + }, + { + "epoch": 0.38064800791824177, + "grad_norm": 0.15846163034439087, + "learning_rate": 0.001, + "loss": 2.1977, + "num_input_tokens_seen": 20915690304, + "step": 39900 + }, + { + "epoch": 0.38112501043443003, + "grad_norm": 0.14499403536319733, + "learning_rate": 0.001, + "loss": 2.1809, + "num_input_tokens_seen": 20941902336, + "step": 39950 + }, + { + "epoch": 0.3816020129506183, + "grad_norm": 0.14737826585769653, + "learning_rate": 0.001, + "loss": 2.1932, + "num_input_tokens_seen": 20968112960, + "step": 40000 + }, + { + "epoch": 0.3816020129506183, + "eval_loss": 2.108198881149292, + "eval_runtime": 81.3944, + "eval_samples_per_second": 61.429, + "eval_steps_per_second": 15.357, + "num_input_tokens_seen": 20968112960, + "step": 40000 + }, + { + "epoch": 0.38207901546680656, + "grad_norm": 0.14910832047462463, + "learning_rate": 0.001, + "loss": 2.195, + "num_input_tokens_seen": 20994316064, + "step": 40050 + }, + { + "epoch": 0.3825560179829949, + "grad_norm": 0.14551687240600586, + "learning_rate": 0.001, + "loss": 2.1933, + "num_input_tokens_seen": 21020528128, + "step": 40100 + }, + { + "epoch": 0.38303302049918314, + "grad_norm": 0.13713929057121277, + "learning_rate": 0.001, + "loss": 2.1921, + "num_input_tokens_seen": 21046742528, + "step": 40150 + }, + { + "epoch": 0.3835100230153714, + "grad_norm": 0.1518137902021408, + "learning_rate": 0.001, + "loss": 2.1767, + "num_input_tokens_seen": 21072955296, + "step": 40200 + }, + { + "epoch": 0.38398702553155967, + "grad_norm": 0.14563234150409698, + "learning_rate": 0.001, + "loss": 2.1913, + "num_input_tokens_seen": 21099166048, + "step": 40250 + }, + { + "epoch": 0.38446402804774793, + "grad_norm": 0.14845997095108032, + "learning_rate": 0.001, + "loss": 2.1935, + "num_input_tokens_seen": 21125378816, + "step": 40300 + }, + { + "epoch": 0.38494103056393625, + "grad_norm": 0.15191951394081116, + "learning_rate": 0.001, + "loss": 2.1873, + "num_input_tokens_seen": 21151588800, + "step": 40350 + }, + { + "epoch": 0.3854180330801245, + "grad_norm": 0.13842670619487762, + "learning_rate": 0.001, + "loss": 2.1851, + "num_input_tokens_seen": 21177797408, + "step": 40400 + }, + { + "epoch": 0.3858950355963128, + "grad_norm": 0.1450972706079483, + "learning_rate": 0.001, + "loss": 2.1998, + "num_input_tokens_seen": 21204010688, + "step": 40450 + }, + { + "epoch": 0.38637203811250104, + "grad_norm": 0.14317825436592102, + "learning_rate": 0.001, + "loss": 2.1933, + "num_input_tokens_seen": 21230223584, + "step": 40500 + }, + { + "epoch": 0.38637203811250104, + "eval_loss": 2.1045005321502686, + "eval_runtime": 82.0473, + "eval_samples_per_second": 60.94, + "eval_steps_per_second": 15.235, + "num_input_tokens_seen": 21230223584, + "step": 40500 + }, + { + "epoch": 0.3868490406286893, + "grad_norm": 0.14561642706394196, + "learning_rate": 0.001, + "loss": 2.2001, + "num_input_tokens_seen": 21256431712, + "step": 40550 + }, + { + "epoch": 0.38732604314487756, + "grad_norm": 0.13989581167697906, + "learning_rate": 0.001, + "loss": 2.1761, + "num_input_tokens_seen": 21282636640, + "step": 40600 + }, + { + "epoch": 0.3878030456610659, + "grad_norm": 0.21038071811199188, + "learning_rate": 0.001, + "loss": 2.1906, + "num_input_tokens_seen": 21308851040, + "step": 40650 + }, + { + "epoch": 0.38828004817725414, + "grad_norm": 0.1928381323814392, + "learning_rate": 0.001, + "loss": 2.187, + "num_input_tokens_seen": 21335059968, + "step": 40700 + }, + { + "epoch": 0.3887570506934424, + "grad_norm": 0.1544865220785141, + "learning_rate": 0.001, + "loss": 2.1866, + "num_input_tokens_seen": 21361274016, + "step": 40750 + }, + { + "epoch": 0.38923405320963067, + "grad_norm": 0.13268694281578064, + "learning_rate": 0.001, + "loss": 2.1874, + "num_input_tokens_seen": 21387478624, + "step": 40800 + }, + { + "epoch": 0.38971105572581893, + "grad_norm": 0.15970471501350403, + "learning_rate": 0.001, + "loss": 2.1942, + "num_input_tokens_seen": 21413693024, + "step": 40850 + }, + { + "epoch": 0.39018805824200725, + "grad_norm": 0.15078318119049072, + "learning_rate": 0.001, + "loss": 2.1837, + "num_input_tokens_seen": 21439892672, + "step": 40900 + }, + { + "epoch": 0.3906650607581955, + "grad_norm": 0.14016783237457275, + "learning_rate": 0.001, + "loss": 2.1993, + "num_input_tokens_seen": 21466102688, + "step": 40950 + }, + { + "epoch": 0.3911420632743838, + "grad_norm": 0.13748787343502045, + "learning_rate": 0.001, + "loss": 2.1907, + "num_input_tokens_seen": 21492310496, + "step": 41000 + }, + { + "epoch": 0.3911420632743838, + "eval_loss": 2.105747699737549, + "eval_runtime": 81.8993, + "eval_samples_per_second": 61.051, + "eval_steps_per_second": 15.263, + "num_input_tokens_seen": 21492310496, + "step": 41000 + }, + { + "epoch": 0.39161906579057204, + "grad_norm": 0.136869415640831, + "learning_rate": 0.001, + "loss": 2.1911, + "num_input_tokens_seen": 21518520608, + "step": 41050 + }, + { + "epoch": 0.3920960683067603, + "grad_norm": 0.15758763253688812, + "learning_rate": 0.001, + "loss": 2.1803, + "num_input_tokens_seen": 21544732864, + "step": 41100 + }, + { + "epoch": 0.39257307082294857, + "grad_norm": 0.1424143761396408, + "learning_rate": 0.001, + "loss": 2.1903, + "num_input_tokens_seen": 21570943840, + "step": 41150 + }, + { + "epoch": 0.3930500733391369, + "grad_norm": 0.14572599530220032, + "learning_rate": 0.001, + "loss": 2.1822, + "num_input_tokens_seen": 21597153312, + "step": 41200 + }, + { + "epoch": 0.39352707585532515, + "grad_norm": 0.1391698122024536, + "learning_rate": 0.001, + "loss": 2.1909, + "num_input_tokens_seen": 21623366976, + "step": 41250 + }, + { + "epoch": 0.3940040783715134, + "grad_norm": 0.13990794122219086, + "learning_rate": 0.001, + "loss": 2.1997, + "num_input_tokens_seen": 21649573728, + "step": 41300 + }, + { + "epoch": 0.3944810808877017, + "grad_norm": 0.1600644737482071, + "learning_rate": 0.001, + "loss": 2.186, + "num_input_tokens_seen": 21675788128, + "step": 41350 + }, + { + "epoch": 0.39495808340388994, + "grad_norm": 0.1378026008605957, + "learning_rate": 0.001, + "loss": 2.1815, + "num_input_tokens_seen": 21701992096, + "step": 41400 + }, + { + "epoch": 0.39543508592007826, + "grad_norm": 0.13701239228248596, + "learning_rate": 0.001, + "loss": 2.195, + "num_input_tokens_seen": 21728199616, + "step": 41450 + }, + { + "epoch": 0.3959120884362665, + "grad_norm": 0.1407209187746048, + "learning_rate": 0.001, + "loss": 2.1806, + "num_input_tokens_seen": 21754409120, + "step": 41500 + }, + { + "epoch": 0.3959120884362665, + "eval_loss": 2.1029863357543945, + "eval_runtime": 82.1541, + "eval_samples_per_second": 60.861, + "eval_steps_per_second": 15.215, + "num_input_tokens_seen": 21754409120, + "step": 41500 + }, + { + "epoch": 0.3963890909524548, + "grad_norm": 0.1486450433731079, + "learning_rate": 0.001, + "loss": 2.1894, + "num_input_tokens_seen": 21780617152, + "step": 41550 + }, + { + "epoch": 0.39686609346864304, + "grad_norm": 0.14881809055805206, + "learning_rate": 0.001, + "loss": 2.1764, + "num_input_tokens_seen": 21806827168, + "step": 41600 + }, + { + "epoch": 0.3973430959848313, + "grad_norm": 0.15954989194869995, + "learning_rate": 0.001, + "loss": 2.195, + "num_input_tokens_seen": 21833038464, + "step": 41650 + }, + { + "epoch": 0.39782009850101957, + "grad_norm": 0.14994557201862335, + "learning_rate": 0.001, + "loss": 2.1934, + "num_input_tokens_seen": 21859246336, + "step": 41700 + }, + { + "epoch": 0.3982971010172079, + "grad_norm": 0.1431296467781067, + "learning_rate": 0.001, + "loss": 2.1858, + "num_input_tokens_seen": 21885459520, + "step": 41750 + }, + { + "epoch": 0.39877410353339615, + "grad_norm": 0.1418553739786148, + "learning_rate": 0.001, + "loss": 2.1859, + "num_input_tokens_seen": 21911671392, + "step": 41800 + }, + { + "epoch": 0.3992511060495844, + "grad_norm": 0.1425972878932953, + "learning_rate": 0.001, + "loss": 2.1917, + "num_input_tokens_seen": 21937879584, + "step": 41850 + }, + { + "epoch": 0.3997281085657727, + "grad_norm": 0.13912352919578552, + "learning_rate": 0.001, + "loss": 2.19, + "num_input_tokens_seen": 21964082080, + "step": 41900 + }, + { + "epoch": 0.40020511108196094, + "grad_norm": 0.16832081973552704, + "learning_rate": 0.001, + "loss": 2.181, + "num_input_tokens_seen": 21990294784, + "step": 41950 + }, + { + "epoch": 0.4006821135981492, + "grad_norm": 0.14969058334827423, + "learning_rate": 0.001, + "loss": 2.1834, + "num_input_tokens_seen": 22016505376, + "step": 42000 + }, + { + "epoch": 0.4006821135981492, + "eval_loss": 2.1013541221618652, + "eval_runtime": 82.4496, + "eval_samples_per_second": 60.643, + "eval_steps_per_second": 15.161, + "num_input_tokens_seen": 22016505376, + "step": 42000 + }, + { + "epoch": 0.4011591161143375, + "grad_norm": 0.14711995422840118, + "learning_rate": 0.001, + "loss": 2.1871, + "num_input_tokens_seen": 22042718080, + "step": 42050 + }, + { + "epoch": 0.4016361186305258, + "grad_norm": 0.14118188619613647, + "learning_rate": 0.001, + "loss": 2.1968, + "num_input_tokens_seen": 22068929760, + "step": 42100 + }, + { + "epoch": 0.40211312114671405, + "grad_norm": 0.14557373523712158, + "learning_rate": 0.001, + "loss": 2.1815, + "num_input_tokens_seen": 22095140192, + "step": 42150 + }, + { + "epoch": 0.4025901236629023, + "grad_norm": 0.14241378009319305, + "learning_rate": 0.001, + "loss": 2.1841, + "num_input_tokens_seen": 22121353760, + "step": 42200 + }, + { + "epoch": 0.4030671261790906, + "grad_norm": 0.14654819667339325, + "learning_rate": 0.001, + "loss": 2.1857, + "num_input_tokens_seen": 22147565312, + "step": 42250 + }, + { + "epoch": 0.4035441286952789, + "grad_norm": 0.14023630321025848, + "learning_rate": 0.001, + "loss": 2.185, + "num_input_tokens_seen": 22173773088, + "step": 42300 + }, + { + "epoch": 0.40402113121146715, + "grad_norm": 0.1503324657678604, + "learning_rate": 0.001, + "loss": 2.1719, + "num_input_tokens_seen": 22199972064, + "step": 42350 + }, + { + "epoch": 0.4044981337276554, + "grad_norm": 0.148145854473114, + "learning_rate": 0.001, + "loss": 2.1854, + "num_input_tokens_seen": 22226182336, + "step": 42400 + }, + { + "epoch": 0.4049751362438437, + "grad_norm": 0.14223705232143402, + "learning_rate": 0.001, + "loss": 2.1941, + "num_input_tokens_seen": 22252394144, + "step": 42450 + }, + { + "epoch": 0.40545213876003194, + "grad_norm": 0.15040171146392822, + "learning_rate": 0.001, + "loss": 2.1914, + "num_input_tokens_seen": 22278605888, + "step": 42500 + }, + { + "epoch": 0.40545213876003194, + "eval_loss": 2.101572036743164, + "eval_runtime": 82.9065, + "eval_samples_per_second": 60.309, + "eval_steps_per_second": 15.077, + "num_input_tokens_seen": 22278605888, + "step": 42500 + }, + { + "epoch": 0.4059291412762202, + "grad_norm": 0.1579235941171646, + "learning_rate": 0.001, + "loss": 2.1765, + "num_input_tokens_seen": 22304819520, + "step": 42550 + }, + { + "epoch": 0.4064061437924085, + "grad_norm": 0.16040007770061493, + "learning_rate": 0.001, + "loss": 2.1853, + "num_input_tokens_seen": 22331025472, + "step": 42600 + }, + { + "epoch": 0.4068831463085968, + "grad_norm": 0.14831505715847015, + "learning_rate": 0.001, + "loss": 2.1894, + "num_input_tokens_seen": 22357235040, + "step": 42650 + }, + { + "epoch": 0.40736014882478505, + "grad_norm": 0.1373136043548584, + "learning_rate": 0.001, + "loss": 2.1872, + "num_input_tokens_seen": 22383446240, + "step": 42700 + }, + { + "epoch": 0.4078371513409733, + "grad_norm": 0.13545425236225128, + "learning_rate": 0.001, + "loss": 2.1964, + "num_input_tokens_seen": 22409660576, + "step": 42750 + }, + { + "epoch": 0.4083141538571616, + "grad_norm": 0.1480574756860733, + "learning_rate": 0.001, + "loss": 2.1892, + "num_input_tokens_seen": 22435870080, + "step": 42800 + }, + { + "epoch": 0.4087911563733499, + "grad_norm": 0.14833049476146698, + "learning_rate": 0.001, + "loss": 2.1793, + "num_input_tokens_seen": 22462079680, + "step": 42850 + }, + { + "epoch": 0.40926815888953816, + "grad_norm": 0.1327161192893982, + "learning_rate": 0.001, + "loss": 2.1834, + "num_input_tokens_seen": 22488293312, + "step": 42900 + }, + { + "epoch": 0.4097451614057264, + "grad_norm": 0.15992066264152527, + "learning_rate": 0.001, + "loss": 2.1759, + "num_input_tokens_seen": 22514505696, + "step": 42950 + }, + { + "epoch": 0.4102221639219147, + "grad_norm": 0.14314264059066772, + "learning_rate": 0.001, + "loss": 2.1932, + "num_input_tokens_seen": 22540715296, + "step": 43000 + }, + { + "epoch": 0.4102221639219147, + "eval_loss": 2.0989809036254883, + "eval_runtime": 82.6516, + "eval_samples_per_second": 60.495, + "eval_steps_per_second": 15.124, + "num_input_tokens_seen": 22540715296, + "step": 43000 + }, + { + "epoch": 0.41069916643810295, + "grad_norm": 0.14374396204948425, + "learning_rate": 0.001, + "loss": 2.1741, + "num_input_tokens_seen": 22566918720, + "step": 43050 + }, + { + "epoch": 0.4111761689542912, + "grad_norm": 0.14959892630577087, + "learning_rate": 0.001, + "loss": 2.1778, + "num_input_tokens_seen": 22593129408, + "step": 43100 + }, + { + "epoch": 0.41165317147047953, + "grad_norm": 0.13896231353282928, + "learning_rate": 0.001, + "loss": 2.1778, + "num_input_tokens_seen": 22619343808, + "step": 43150 + }, + { + "epoch": 0.4121301739866678, + "grad_norm": 0.14940877258777618, + "learning_rate": 0.001, + "loss": 2.1903, + "num_input_tokens_seen": 22645555552, + "step": 43200 + }, + { + "epoch": 0.41260717650285605, + "grad_norm": 0.14699922502040863, + "learning_rate": 0.001, + "loss": 2.1882, + "num_input_tokens_seen": 22671765856, + "step": 43250 + }, + { + "epoch": 0.4130841790190443, + "grad_norm": 0.13644367456436157, + "learning_rate": 0.001, + "loss": 2.186, + "num_input_tokens_seen": 22697978816, + "step": 43300 + }, + { + "epoch": 0.4135611815352326, + "grad_norm": 0.13732574880123138, + "learning_rate": 0.001, + "loss": 2.1823, + "num_input_tokens_seen": 22724189600, + "step": 43350 + }, + { + "epoch": 0.4140381840514209, + "grad_norm": 0.20614680647850037, + "learning_rate": 0.001, + "loss": 2.204, + "num_input_tokens_seen": 22750393024, + "step": 43400 + }, + { + "epoch": 0.41451518656760916, + "grad_norm": 0.3353808522224426, + "learning_rate": 0.001, + "loss": 2.2219, + "num_input_tokens_seen": 22776601856, + "step": 43450 + }, + { + "epoch": 0.4149921890837974, + "grad_norm": 0.12795452773571014, + "learning_rate": 0.001, + "loss": 2.2209, + "num_input_tokens_seen": 22802815776, + "step": 43500 + }, + { + "epoch": 0.4149921890837974, + "eval_loss": 2.1085610389709473, + "eval_runtime": 81.8271, + "eval_samples_per_second": 61.104, + "eval_steps_per_second": 15.276, + "num_input_tokens_seen": 22802815776, + "step": 43500 + }, + { + "epoch": 0.4154691915999857, + "grad_norm": 0.1316380500793457, + "learning_rate": 0.001, + "loss": 2.1859, + "num_input_tokens_seen": 22829029248, + "step": 43550 + }, + { + "epoch": 0.41594619411617395, + "grad_norm": 0.14432553946971893, + "learning_rate": 0.001, + "loss": 2.1942, + "num_input_tokens_seen": 22855233984, + "step": 43600 + }, + { + "epoch": 0.4164231966323622, + "grad_norm": 0.14184366166591644, + "learning_rate": 0.001, + "loss": 2.1843, + "num_input_tokens_seen": 22881440160, + "step": 43650 + }, + { + "epoch": 0.41690019914855053, + "grad_norm": 0.13516154885292053, + "learning_rate": 0.001, + "loss": 2.1828, + "num_input_tokens_seen": 22907643456, + "step": 43700 + }, + { + "epoch": 0.4173772016647388, + "grad_norm": 0.14672012627124786, + "learning_rate": 0.001, + "loss": 2.1828, + "num_input_tokens_seen": 22933857824, + "step": 43750 + }, + { + "epoch": 0.41785420418092706, + "grad_norm": 0.15259918570518494, + "learning_rate": 0.001, + "loss": 2.1908, + "num_input_tokens_seen": 22960061120, + "step": 43800 + }, + { + "epoch": 0.4183312066971153, + "grad_norm": 0.1380903720855713, + "learning_rate": 0.001, + "loss": 2.1685, + "num_input_tokens_seen": 22986272704, + "step": 43850 + }, + { + "epoch": 0.4188082092133036, + "grad_norm": 0.149773970246315, + "learning_rate": 0.001, + "loss": 2.1773, + "num_input_tokens_seen": 23012484064, + "step": 43900 + }, + { + "epoch": 0.41928521172949185, + "grad_norm": 0.13098189234733582, + "learning_rate": 0.001, + "loss": 2.1854, + "num_input_tokens_seen": 23038695008, + "step": 43950 + }, + { + "epoch": 0.41976221424568017, + "grad_norm": 0.17141763865947723, + "learning_rate": 0.001, + "loss": 2.1856, + "num_input_tokens_seen": 23064909408, + "step": 44000 + }, + { + "epoch": 0.41976221424568017, + "eval_loss": 2.0980746746063232, + "eval_runtime": 82.0381, + "eval_samples_per_second": 60.947, + "eval_steps_per_second": 15.237, + "num_input_tokens_seen": 23064909408, + "step": 44000 + }, + { + "epoch": 0.42023921676186843, + "grad_norm": 0.12846800684928894, + "learning_rate": 0.001, + "loss": 2.1901, + "num_input_tokens_seen": 23091122912, + "step": 44050 + }, + { + "epoch": 0.4207162192780567, + "grad_norm": 0.14414989948272705, + "learning_rate": 0.001, + "loss": 2.1852, + "num_input_tokens_seen": 23117335840, + "step": 44100 + }, + { + "epoch": 0.42119322179424495, + "grad_norm": 0.13725394010543823, + "learning_rate": 0.001, + "loss": 2.192, + "num_input_tokens_seen": 23143542976, + "step": 44150 + }, + { + "epoch": 0.4216702243104332, + "grad_norm": 0.14113777875900269, + "learning_rate": 0.001, + "loss": 2.171, + "num_input_tokens_seen": 23169753472, + "step": 44200 + }, + { + "epoch": 0.42214722682662154, + "grad_norm": 0.1349649280309677, + "learning_rate": 0.001, + "loss": 2.174, + "num_input_tokens_seen": 23195956960, + "step": 44250 + }, + { + "epoch": 0.4226242293428098, + "grad_norm": 0.13642828166484833, + "learning_rate": 0.001, + "loss": 2.1926, + "num_input_tokens_seen": 23222168320, + "step": 44300 + }, + { + "epoch": 0.42310123185899806, + "grad_norm": 0.15120643377304077, + "learning_rate": 0.001, + "loss": 2.1745, + "num_input_tokens_seen": 23248378144, + "step": 44350 + }, + { + "epoch": 0.4235782343751863, + "grad_norm": 0.13176341354846954, + "learning_rate": 0.001, + "loss": 2.1863, + "num_input_tokens_seen": 23274588960, + "step": 44400 + }, + { + "epoch": 0.4240552368913746, + "grad_norm": 0.14402221143245697, + "learning_rate": 0.001, + "loss": 2.1782, + "num_input_tokens_seen": 23300803360, + "step": 44450 + }, + { + "epoch": 0.42453223940756285, + "grad_norm": 0.14719614386558533, + "learning_rate": 0.001, + "loss": 2.1823, + "num_input_tokens_seen": 23327017760, + "step": 44500 + }, + { + "epoch": 0.42453223940756285, + "eval_loss": 2.0960397720336914, + "eval_runtime": 82.6503, + "eval_samples_per_second": 60.496, + "eval_steps_per_second": 15.124, + "num_input_tokens_seen": 23327017760, + "step": 44500 + }, + { + "epoch": 0.42500924192375117, + "grad_norm": 0.14565804600715637, + "learning_rate": 0.001, + "loss": 2.1762, + "num_input_tokens_seen": 23353225568, + "step": 44550 + }, + { + "epoch": 0.42548624443993943, + "grad_norm": 0.13152356445789337, + "learning_rate": 0.001, + "loss": 2.1789, + "num_input_tokens_seen": 23379439744, + "step": 44600 + }, + { + "epoch": 0.4259632469561277, + "grad_norm": 0.1694796234369278, + "learning_rate": 0.001, + "loss": 2.1851, + "num_input_tokens_seen": 23405652928, + "step": 44650 + }, + { + "epoch": 0.42644024947231596, + "grad_norm": 0.14656352996826172, + "learning_rate": 0.001, + "loss": 2.1733, + "num_input_tokens_seen": 23431859328, + "step": 44700 + }, + { + "epoch": 0.4269172519885042, + "grad_norm": 0.15425816178321838, + "learning_rate": 0.001, + "loss": 2.1853, + "num_input_tokens_seen": 23458067136, + "step": 44750 + }, + { + "epoch": 0.42739425450469254, + "grad_norm": 0.14381302893161774, + "learning_rate": 0.001, + "loss": 2.1893, + "num_input_tokens_seen": 23484275232, + "step": 44800 + }, + { + "epoch": 0.4278712570208808, + "grad_norm": 0.14928653836250305, + "learning_rate": 0.001, + "loss": 2.1807, + "num_input_tokens_seen": 23510489632, + "step": 44850 + }, + { + "epoch": 0.42834825953706906, + "grad_norm": 0.15223214030265808, + "learning_rate": 0.001, + "loss": 2.1714, + "num_input_tokens_seen": 23536691232, + "step": 44900 + }, + { + "epoch": 0.42882526205325733, + "grad_norm": 0.14738094806671143, + "learning_rate": 0.001, + "loss": 2.17, + "num_input_tokens_seen": 23562905280, + "step": 44950 + }, + { + "epoch": 0.4293022645694456, + "grad_norm": 0.14292684197425842, + "learning_rate": 0.001, + "loss": 2.1862, + "num_input_tokens_seen": 23589115072, + "step": 45000 + }, + { + "epoch": 0.4293022645694456, + "eval_loss": 2.093562126159668, + "eval_runtime": 82.6691, + "eval_samples_per_second": 60.482, + "eval_steps_per_second": 15.121, + "num_input_tokens_seen": 23589115072, + "step": 45000 + }, + { + "epoch": 0.42977926708563385, + "grad_norm": 0.142947256565094, + "learning_rate": 0.001, + "loss": 2.1711, + "num_input_tokens_seen": 23615324192, + "step": 45050 + }, + { + "epoch": 0.43025626960182217, + "grad_norm": 0.14863619208335876, + "learning_rate": 0.001, + "loss": 2.1782, + "num_input_tokens_seen": 23641537056, + "step": 45100 + }, + { + "epoch": 0.43073327211801044, + "grad_norm": 0.1470208466053009, + "learning_rate": 0.001, + "loss": 2.1829, + "num_input_tokens_seen": 23667747168, + "step": 45150 + }, + { + "epoch": 0.4312102746341987, + "grad_norm": 0.1326986402273178, + "learning_rate": 0.001, + "loss": 2.1844, + "num_input_tokens_seen": 23693952480, + "step": 45200 + }, + { + "epoch": 0.43168727715038696, + "grad_norm": 0.14702260494232178, + "learning_rate": 0.001, + "loss": 2.1752, + "num_input_tokens_seen": 23720163968, + "step": 45250 + }, + { + "epoch": 0.4321642796665752, + "grad_norm": 0.14227628707885742, + "learning_rate": 0.001, + "loss": 2.1794, + "num_input_tokens_seen": 23746374976, + "step": 45300 + }, + { + "epoch": 0.4326412821827635, + "grad_norm": 0.15879526734352112, + "learning_rate": 0.001, + "loss": 2.1892, + "num_input_tokens_seen": 23772583744, + "step": 45350 + }, + { + "epoch": 0.4331182846989518, + "grad_norm": 0.14691776037216187, + "learning_rate": 0.001, + "loss": 2.1887, + "num_input_tokens_seen": 23798794624, + "step": 45400 + }, + { + "epoch": 0.43359528721514007, + "grad_norm": 0.1442701518535614, + "learning_rate": 0.001, + "loss": 2.1744, + "num_input_tokens_seen": 23824995936, + "step": 45450 + }, + { + "epoch": 0.43407228973132833, + "grad_norm": 0.14576993882656097, + "learning_rate": 0.001, + "loss": 2.182, + "num_input_tokens_seen": 23851210336, + "step": 45500 + }, + { + "epoch": 0.43407228973132833, + "eval_loss": 2.092682123184204, + "eval_runtime": 82.1558, + "eval_samples_per_second": 60.86, + "eval_steps_per_second": 15.215, + "num_input_tokens_seen": 23851210336, + "step": 45500 + }, + { + "epoch": 0.4345492922475166, + "grad_norm": 0.13830795884132385, + "learning_rate": 0.001, + "loss": 2.1708, + "num_input_tokens_seen": 23877410144, + "step": 45550 + }, + { + "epoch": 0.43502629476370486, + "grad_norm": 0.15177521109580994, + "learning_rate": 0.001, + "loss": 2.1688, + "num_input_tokens_seen": 23903612352, + "step": 45600 + }, + { + "epoch": 0.4355032972798932, + "grad_norm": 0.15547911822795868, + "learning_rate": 0.001, + "loss": 2.1825, + "num_input_tokens_seen": 23929814720, + "step": 45650 + }, + { + "epoch": 0.43598029979608144, + "grad_norm": 0.14573921263217926, + "learning_rate": 0.001, + "loss": 2.1749, + "num_input_tokens_seen": 23956024000, + "step": 45700 + }, + { + "epoch": 0.4364573023122697, + "grad_norm": 0.14333628118038177, + "learning_rate": 0.001, + "loss": 2.1751, + "num_input_tokens_seen": 23982236352, + "step": 45750 + }, + { + "epoch": 0.43693430482845796, + "grad_norm": 0.15511895716190338, + "learning_rate": 0.001, + "loss": 2.1775, + "num_input_tokens_seen": 24008446560, + "step": 45800 + }, + { + "epoch": 0.4374113073446462, + "grad_norm": 0.14994923770427704, + "learning_rate": 0.001, + "loss": 2.181, + "num_input_tokens_seen": 24034660960, + "step": 45850 + }, + { + "epoch": 0.4378883098608345, + "grad_norm": 0.13673779368400574, + "learning_rate": 0.001, + "loss": 2.1658, + "num_input_tokens_seen": 24060863552, + "step": 45900 + }, + { + "epoch": 0.4383653123770228, + "grad_norm": 0.13953204452991486, + "learning_rate": 0.001, + "loss": 2.1705, + "num_input_tokens_seen": 24087066784, + "step": 45950 + }, + { + "epoch": 0.43884231489321107, + "grad_norm": 0.14653468132019043, + "learning_rate": 0.001, + "loss": 2.1729, + "num_input_tokens_seen": 24113281184, + "step": 46000 + }, + { + "epoch": 0.43884231489321107, + "eval_loss": 2.091700792312622, + "eval_runtime": 82.8172, + "eval_samples_per_second": 60.374, + "eval_steps_per_second": 15.093, + "num_input_tokens_seen": 24113281184, + "step": 46000 + }, + { + "epoch": 0.43931931740939933, + "grad_norm": 0.1543819010257721, + "learning_rate": 0.001, + "loss": 2.1704, + "num_input_tokens_seen": 24139486272, + "step": 46050 + }, + { + "epoch": 0.4397963199255876, + "grad_norm": 0.15496985614299774, + "learning_rate": 0.001, + "loss": 2.1818, + "num_input_tokens_seen": 24165699840, + "step": 46100 + }, + { + "epoch": 0.44027332244177586, + "grad_norm": 0.15104669332504272, + "learning_rate": 0.001, + "loss": 2.1816, + "num_input_tokens_seen": 24191906496, + "step": 46150 + }, + { + "epoch": 0.4407503249579642, + "grad_norm": 0.14507949352264404, + "learning_rate": 0.001, + "loss": 2.1793, + "num_input_tokens_seen": 24218118464, + "step": 46200 + }, + { + "epoch": 0.44122732747415244, + "grad_norm": 0.14487695693969727, + "learning_rate": 0.001, + "loss": 2.1673, + "num_input_tokens_seen": 24244332864, + "step": 46250 + }, + { + "epoch": 0.4417043299903407, + "grad_norm": 0.1322576105594635, + "learning_rate": 0.001, + "loss": 2.1752, + "num_input_tokens_seen": 24270547264, + "step": 46300 + }, + { + "epoch": 0.44218133250652897, + "grad_norm": 0.13863323628902435, + "learning_rate": 0.001, + "loss": 2.1724, + "num_input_tokens_seen": 24296760224, + "step": 46350 + }, + { + "epoch": 0.44265833502271723, + "grad_norm": 0.1451748162508011, + "learning_rate": 0.001, + "loss": 2.1739, + "num_input_tokens_seen": 24322974624, + "step": 46400 + }, + { + "epoch": 0.4431353375389055, + "grad_norm": 0.15124155580997467, + "learning_rate": 0.001, + "loss": 2.1692, + "num_input_tokens_seen": 24349183648, + "step": 46450 + }, + { + "epoch": 0.4436123400550938, + "grad_norm": 0.14303581416606903, + "learning_rate": 0.001, + "loss": 2.177, + "num_input_tokens_seen": 24375397792, + "step": 46500 + }, + { + "epoch": 0.4436123400550938, + "eval_loss": 2.0904366970062256, + "eval_runtime": 82.436, + "eval_samples_per_second": 60.653, + "eval_steps_per_second": 15.163, + "num_input_tokens_seen": 24375397792, + "step": 46500 + }, + { + "epoch": 0.4440893425712821, + "grad_norm": 0.14103703200817108, + "learning_rate": 0.001, + "loss": 2.1682, + "num_input_tokens_seen": 24401608192, + "step": 46550 + }, + { + "epoch": 0.44456634508747034, + "grad_norm": 0.1284860521554947, + "learning_rate": 0.001, + "loss": 2.1704, + "num_input_tokens_seen": 24427820992, + "step": 46600 + }, + { + "epoch": 0.4450433476036586, + "grad_norm": 0.1443055421113968, + "learning_rate": 0.001, + "loss": 2.1686, + "num_input_tokens_seen": 24454025824, + "step": 46650 + }, + { + "epoch": 0.44552035011984686, + "grad_norm": 0.1435597836971283, + "learning_rate": 0.001, + "loss": 2.1806, + "num_input_tokens_seen": 24480239680, + "step": 46700 + }, + { + "epoch": 0.4459973526360352, + "grad_norm": 0.15132416784763336, + "learning_rate": 0.001, + "loss": 2.1737, + "num_input_tokens_seen": 24506453920, + "step": 46750 + }, + { + "epoch": 0.44647435515222345, + "grad_norm": 0.1403588205575943, + "learning_rate": 0.001, + "loss": 2.1701, + "num_input_tokens_seen": 24532664288, + "step": 46800 + }, + { + "epoch": 0.4469513576684117, + "grad_norm": 0.15247014164924622, + "learning_rate": 0.001, + "loss": 2.1694, + "num_input_tokens_seen": 24558878688, + "step": 46850 + }, + { + "epoch": 0.44742836018459997, + "grad_norm": 0.14112932980060577, + "learning_rate": 0.001, + "loss": 2.1797, + "num_input_tokens_seen": 24585085216, + "step": 46900 + }, + { + "epoch": 0.44790536270078823, + "grad_norm": 0.14278770983219147, + "learning_rate": 0.001, + "loss": 2.1671, + "num_input_tokens_seen": 24611299616, + "step": 46950 + }, + { + "epoch": 0.4483823652169765, + "grad_norm": 0.15718604624271393, + "learning_rate": 0.001, + "loss": 2.1674, + "num_input_tokens_seen": 24637513248, + "step": 47000 + }, + { + "epoch": 0.4483823652169765, + "eval_loss": 2.08896541595459, + "eval_runtime": 81.7194, + "eval_samples_per_second": 61.185, + "eval_steps_per_second": 15.296, + "num_input_tokens_seen": 24637513248, + "step": 47000 + }, + { + "epoch": 0.4488593677331648, + "grad_norm": 0.14600330591201782, + "learning_rate": 0.001, + "loss": 2.1773, + "num_input_tokens_seen": 24663726880, + "step": 47050 + }, + { + "epoch": 0.4493363702493531, + "grad_norm": 0.13896551728248596, + "learning_rate": 0.001, + "loss": 2.1699, + "num_input_tokens_seen": 24689934976, + "step": 47100 + }, + { + "epoch": 0.44981337276554134, + "grad_norm": 0.15189655125141144, + "learning_rate": 0.001, + "loss": 2.1747, + "num_input_tokens_seen": 24716146208, + "step": 47150 + }, + { + "epoch": 0.4502903752817296, + "grad_norm": 0.1438799947500229, + "learning_rate": 0.001, + "loss": 2.1754, + "num_input_tokens_seen": 24742351360, + "step": 47200 + }, + { + "epoch": 0.45076737779791787, + "grad_norm": 0.14087191224098206, + "learning_rate": 0.001, + "loss": 2.1659, + "num_input_tokens_seen": 24768557056, + "step": 47250 + }, + { + "epoch": 0.45124438031410613, + "grad_norm": 0.1569574773311615, + "learning_rate": 0.001, + "loss": 2.1765, + "num_input_tokens_seen": 24794768736, + "step": 47300 + }, + { + "epoch": 0.45172138283029445, + "grad_norm": 0.14594893157482147, + "learning_rate": 0.001, + "loss": 2.1867, + "num_input_tokens_seen": 24820973728, + "step": 47350 + }, + { + "epoch": 0.4521983853464827, + "grad_norm": 0.13743354380130768, + "learning_rate": 0.001, + "loss": 2.1671, + "num_input_tokens_seen": 24847180800, + "step": 47400 + }, + { + "epoch": 0.452675387862671, + "grad_norm": 0.14880713820457458, + "learning_rate": 0.001, + "loss": 2.1834, + "num_input_tokens_seen": 24873395200, + "step": 47450 + }, + { + "epoch": 0.45315239037885924, + "grad_norm": 0.13658978044986725, + "learning_rate": 0.001, + "loss": 2.1608, + "num_input_tokens_seen": 24899608000, + "step": 47500 + }, + { + "epoch": 0.45315239037885924, + "eval_loss": 2.0886528491973877, + "eval_runtime": 82.7799, + "eval_samples_per_second": 60.401, + "eval_steps_per_second": 15.1, + "num_input_tokens_seen": 24899608000, + "step": 47500 + }, + { + "epoch": 0.4536293928950475, + "grad_norm": 0.14707359671592712, + "learning_rate": 0.001, + "loss": 2.172, + "num_input_tokens_seen": 24925815680, + "step": 47550 + }, + { + "epoch": 0.4541063954112358, + "grad_norm": 0.16340535879135132, + "learning_rate": 0.001, + "loss": 2.1721, + "num_input_tokens_seen": 24952024960, + "step": 47600 + }, + { + "epoch": 0.4545833979274241, + "grad_norm": 0.14133617281913757, + "learning_rate": 0.001, + "loss": 2.1682, + "num_input_tokens_seen": 24978238080, + "step": 47650 + }, + { + "epoch": 0.45506040044361235, + "grad_norm": 0.14507652819156647, + "learning_rate": 0.001, + "loss": 2.1717, + "num_input_tokens_seen": 25004442496, + "step": 47700 + }, + { + "epoch": 0.4555374029598006, + "grad_norm": 0.1635296642780304, + "learning_rate": 0.001, + "loss": 2.1722, + "num_input_tokens_seen": 25030655840, + "step": 47750 + }, + { + "epoch": 0.45601440547598887, + "grad_norm": 0.15049296617507935, + "learning_rate": 0.001, + "loss": 2.1647, + "num_input_tokens_seen": 25056870240, + "step": 47800 + }, + { + "epoch": 0.45649140799217713, + "grad_norm": 0.14016319811344147, + "learning_rate": 0.001, + "loss": 2.3042, + "num_input_tokens_seen": 25083083712, + "step": 47850 + }, + { + "epoch": 0.45696841050836545, + "grad_norm": 0.1369781345129013, + "learning_rate": 0.001, + "loss": 2.21, + "num_input_tokens_seen": 25109294720, + "step": 47900 + }, + { + "epoch": 0.4574454130245537, + "grad_norm": 0.13268031179904938, + "learning_rate": 0.001, + "loss": 2.1809, + "num_input_tokens_seen": 25135504256, + "step": 47950 + }, + { + "epoch": 0.457922415540742, + "grad_norm": 0.13591749966144562, + "learning_rate": 0.001, + "loss": 2.1808, + "num_input_tokens_seen": 25161718656, + "step": 48000 + }, + { + "epoch": 0.457922415540742, + "eval_loss": 2.0938363075256348, + "eval_runtime": 81.9703, + "eval_samples_per_second": 60.998, + "eval_steps_per_second": 15.249, + "num_input_tokens_seen": 25161718656, + "step": 48000 + }, + { + "epoch": 0.45839941805693024, + "grad_norm": 0.13940733671188354, + "learning_rate": 0.001, + "loss": 2.174, + "num_input_tokens_seen": 25187922848, + "step": 48050 + }, + { + "epoch": 0.4588764205731185, + "grad_norm": 0.16502974927425385, + "learning_rate": 0.001, + "loss": 2.1807, + "num_input_tokens_seen": 25214132864, + "step": 48100 + }, + { + "epoch": 0.4593534230893068, + "grad_norm": 0.15250737965106964, + "learning_rate": 0.001, + "loss": 2.1831, + "num_input_tokens_seen": 25240339520, + "step": 48150 + }, + { + "epoch": 0.4598304256054951, + "grad_norm": 0.14336740970611572, + "learning_rate": 0.001, + "loss": 2.175, + "num_input_tokens_seen": 25266553920, + "step": 48200 + }, + { + "epoch": 0.46030742812168335, + "grad_norm": 0.1376286745071411, + "learning_rate": 0.001, + "loss": 2.1733, + "num_input_tokens_seen": 25292766560, + "step": 48250 + }, + { + "epoch": 0.4607844306378716, + "grad_norm": 0.1339864432811737, + "learning_rate": 0.001, + "loss": 2.1667, + "num_input_tokens_seen": 25318970496, + "step": 48300 + }, + { + "epoch": 0.4612614331540599, + "grad_norm": 0.14675366878509521, + "learning_rate": 0.001, + "loss": 2.1784, + "num_input_tokens_seen": 25345180512, + "step": 48350 + }, + { + "epoch": 0.46173843567024814, + "grad_norm": 0.14352139830589294, + "learning_rate": 0.001, + "loss": 2.1915, + "num_input_tokens_seen": 25371386368, + "step": 48400 + }, + { + "epoch": 0.46221543818643646, + "grad_norm": 0.14589083194732666, + "learning_rate": 0.001, + "loss": 2.1692, + "num_input_tokens_seen": 25397588192, + "step": 48450 + }, + { + "epoch": 0.4626924407026247, + "grad_norm": 0.1392335146665573, + "learning_rate": 0.001, + "loss": 2.1811, + "num_input_tokens_seen": 25423801984, + "step": 48500 + }, + { + "epoch": 0.4626924407026247, + "eval_loss": 2.0870039463043213, + "eval_runtime": 82.4574, + "eval_samples_per_second": 60.637, + "eval_steps_per_second": 15.159, + "num_input_tokens_seen": 25423801984, + "step": 48500 + }, + { + "epoch": 0.463169443218813, + "grad_norm": 0.14096789062023163, + "learning_rate": 0.001, + "loss": 2.1822, + "num_input_tokens_seen": 25450016384, + "step": 48550 + }, + { + "epoch": 0.46364644573500124, + "grad_norm": 0.13657501339912415, + "learning_rate": 0.001, + "loss": 2.1633, + "num_input_tokens_seen": 25476223712, + "step": 48600 + }, + { + "epoch": 0.4641234482511895, + "grad_norm": 0.1375761330127716, + "learning_rate": 0.001, + "loss": 2.1601, + "num_input_tokens_seen": 25502435136, + "step": 48650 + }, + { + "epoch": 0.46460045076737777, + "grad_norm": 0.13810068368911743, + "learning_rate": 0.001, + "loss": 2.1651, + "num_input_tokens_seen": 25528648192, + "step": 48700 + }, + { + "epoch": 0.4650774532835661, + "grad_norm": 0.1375926285982132, + "learning_rate": 0.001, + "loss": 2.1766, + "num_input_tokens_seen": 25554860256, + "step": 48750 + }, + { + "epoch": 0.46555445579975435, + "grad_norm": 0.14654815196990967, + "learning_rate": 0.001, + "loss": 2.1634, + "num_input_tokens_seen": 25581068864, + "step": 48800 + }, + { + "epoch": 0.4660314583159426, + "grad_norm": 0.1339625120162964, + "learning_rate": 0.001, + "loss": 2.1681, + "num_input_tokens_seen": 25607278112, + "step": 48850 + }, + { + "epoch": 0.4665084608321309, + "grad_norm": 0.13390694558620453, + "learning_rate": 0.001, + "loss": 2.1789, + "num_input_tokens_seen": 25633491968, + "step": 48900 + }, + { + "epoch": 0.46698546334831914, + "grad_norm": 0.14397822320461273, + "learning_rate": 0.001, + "loss": 2.1525, + "num_input_tokens_seen": 25659705568, + "step": 48950 + }, + { + "epoch": 0.46746246586450746, + "grad_norm": 0.12739968299865723, + "learning_rate": 0.001, + "loss": 2.1621, + "num_input_tokens_seen": 25685912544, + "step": 49000 + }, + { + "epoch": 0.46746246586450746, + "eval_loss": 2.0851972103118896, + "eval_runtime": 82.4678, + "eval_samples_per_second": 60.63, + "eval_steps_per_second": 15.157, + "num_input_tokens_seen": 25685912544, + "step": 49000 + }, + { + "epoch": 0.4679394683806957, + "grad_norm": 0.14692357182502747, + "learning_rate": 0.001, + "loss": 2.1643, + "num_input_tokens_seen": 25712121760, + "step": 49050 + }, + { + "epoch": 0.468416470896884, + "grad_norm": 0.13649721443653107, + "learning_rate": 0.001, + "loss": 2.1685, + "num_input_tokens_seen": 25738335680, + "step": 49100 + }, + { + "epoch": 0.46889347341307225, + "grad_norm": 0.1307746022939682, + "learning_rate": 0.001, + "loss": 2.1742, + "num_input_tokens_seen": 25764544640, + "step": 49150 + }, + { + "epoch": 0.4693704759292605, + "grad_norm": 0.1445266157388687, + "learning_rate": 0.001, + "loss": 2.1758, + "num_input_tokens_seen": 25790759040, + "step": 49200 + }, + { + "epoch": 0.4698474784454488, + "grad_norm": 0.14383389055728912, + "learning_rate": 0.001, + "loss": 2.1634, + "num_input_tokens_seen": 25816971648, + "step": 49250 + }, + { + "epoch": 0.4703244809616371, + "grad_norm": 0.1344735324382782, + "learning_rate": 0.001, + "loss": 2.1748, + "num_input_tokens_seen": 25843179040, + "step": 49300 + }, + { + "epoch": 0.47080148347782536, + "grad_norm": 0.1436392366886139, + "learning_rate": 0.001, + "loss": 2.1739, + "num_input_tokens_seen": 25869388544, + "step": 49350 + }, + { + "epoch": 0.4712784859940136, + "grad_norm": 0.14839567244052887, + "learning_rate": 0.001, + "loss": 2.1645, + "num_input_tokens_seen": 25895600800, + "step": 49400 + }, + { + "epoch": 0.4717554885102019, + "grad_norm": 0.14000116288661957, + "learning_rate": 0.001, + "loss": 2.1754, + "num_input_tokens_seen": 25921808160, + "step": 49450 + }, + { + "epoch": 0.47223249102639014, + "grad_norm": 0.1437309980392456, + "learning_rate": 0.001, + "loss": 2.1722, + "num_input_tokens_seen": 25948022560, + "step": 49500 + }, + { + "epoch": 0.47223249102639014, + "eval_loss": 2.083247184753418, + "eval_runtime": 82.7627, + "eval_samples_per_second": 60.414, + "eval_steps_per_second": 15.103, + "num_input_tokens_seen": 25948022560, + "step": 49500 + }, + { + "epoch": 0.47270949354257846, + "grad_norm": 0.14076103270053864, + "learning_rate": 0.001, + "loss": 2.1761, + "num_input_tokens_seen": 25974234496, + "step": 49550 + }, + { + "epoch": 0.4731864960587667, + "grad_norm": 0.13715969026088715, + "learning_rate": 0.001, + "loss": 2.1728, + "num_input_tokens_seen": 26000448896, + "step": 49600 + }, + { + "epoch": 0.473663498574955, + "grad_norm": 0.14823545515537262, + "learning_rate": 0.001, + "loss": 2.1646, + "num_input_tokens_seen": 26026663296, + "step": 49650 + }, + { + "epoch": 0.47414050109114325, + "grad_norm": 0.1491384655237198, + "learning_rate": 0.001, + "loss": 2.1674, + "num_input_tokens_seen": 26052869728, + "step": 49700 + }, + { + "epoch": 0.4746175036073315, + "grad_norm": 0.13799893856048584, + "learning_rate": 0.001, + "loss": 2.1577, + "num_input_tokens_seen": 26079080768, + "step": 49750 + }, + { + "epoch": 0.4750945061235198, + "grad_norm": 0.1610012948513031, + "learning_rate": 0.001, + "loss": 2.1546, + "num_input_tokens_seen": 26105295168, + "step": 49800 + }, + { + "epoch": 0.4755715086397081, + "grad_norm": 0.13887785375118256, + "learning_rate": 0.001, + "loss": 2.1642, + "num_input_tokens_seen": 26131495392, + "step": 49850 + }, + { + "epoch": 0.47604851115589636, + "grad_norm": 0.14724285900592804, + "learning_rate": 0.001, + "loss": 2.167, + "num_input_tokens_seen": 26157704320, + "step": 49900 + }, + { + "epoch": 0.4765255136720846, + "grad_norm": 0.13931205868721008, + "learning_rate": 0.001, + "loss": 2.1665, + "num_input_tokens_seen": 26183918720, + "step": 49950 + }, + { + "epoch": 0.4770025161882729, + "grad_norm": 0.13653016090393066, + "learning_rate": 0.001, + "loss": 2.1745, + "num_input_tokens_seen": 26210133120, + "step": 50000 + }, + { + "epoch": 0.4770025161882729, + "eval_loss": 2.0823795795440674, + "eval_runtime": 82.2807, + "eval_samples_per_second": 60.768, + "eval_steps_per_second": 15.192, + "num_input_tokens_seen": 26210133120, + "step": 50000 + }, + { + "epoch": 0.47747951870446115, + "grad_norm": 0.14365418255329132, + "learning_rate": 0.001, + "loss": 2.1567, + "num_input_tokens_seen": 26236339488, + "step": 50050 + }, + { + "epoch": 0.47795652122064947, + "grad_norm": 0.1386982500553131, + "learning_rate": 0.001, + "loss": 2.158, + "num_input_tokens_seen": 26262549248, + "step": 50100 + }, + { + "epoch": 0.47843352373683773, + "grad_norm": 0.1469505876302719, + "learning_rate": 0.001, + "loss": 2.1654, + "num_input_tokens_seen": 26288757120, + "step": 50150 + }, + { + "epoch": 0.478910526253026, + "grad_norm": 0.1320936232805252, + "learning_rate": 0.001, + "loss": 2.167, + "num_input_tokens_seen": 26314968160, + "step": 50200 + }, + { + "epoch": 0.47938752876921426, + "grad_norm": 0.14790290594100952, + "learning_rate": 0.001, + "loss": 2.1601, + "num_input_tokens_seen": 26341180480, + "step": 50250 + }, + { + "epoch": 0.4798645312854025, + "grad_norm": 0.14135821163654327, + "learning_rate": 0.001, + "loss": 2.1659, + "num_input_tokens_seen": 26367381728, + "step": 50300 + }, + { + "epoch": 0.4803415338015908, + "grad_norm": 0.13028793036937714, + "learning_rate": 0.001, + "loss": 2.1619, + "num_input_tokens_seen": 26393594272, + "step": 50350 + }, + { + "epoch": 0.4808185363177791, + "grad_norm": 0.16743403673171997, + "learning_rate": 0.001, + "loss": 2.1674, + "num_input_tokens_seen": 26419802944, + "step": 50400 + }, + { + "epoch": 0.48129553883396736, + "grad_norm": 0.145367830991745, + "learning_rate": 0.001, + "loss": 2.1681, + "num_input_tokens_seen": 26446014976, + "step": 50450 + }, + { + "epoch": 0.4817725413501556, + "grad_norm": 0.149298757314682, + "learning_rate": 0.001, + "loss": 2.1529, + "num_input_tokens_seen": 26472227840, + "step": 50500 + }, + { + "epoch": 0.4817725413501556, + "eval_loss": 2.0811688899993896, + "eval_runtime": 80.0431, + "eval_samples_per_second": 62.466, + "eval_steps_per_second": 15.617, + "num_input_tokens_seen": 26472227840, + "step": 50500 + }, + { + "epoch": 0.4822495438663439, + "grad_norm": 0.1397143453359604, + "learning_rate": 0.001, + "loss": 2.1683, + "num_input_tokens_seen": 26498441728, + "step": 50550 + }, + { + "epoch": 0.48272654638253215, + "grad_norm": 0.14634068310260773, + "learning_rate": 0.001, + "loss": 2.1586, + "num_input_tokens_seen": 26524650432, + "step": 50600 + }, + { + "epoch": 0.4832035488987204, + "grad_norm": 0.15363429486751556, + "learning_rate": 0.001, + "loss": 2.1711, + "num_input_tokens_seen": 26550864832, + "step": 50650 + }, + { + "epoch": 0.48368055141490873, + "grad_norm": 0.15214493870735168, + "learning_rate": 0.001, + "loss": 2.1623, + "num_input_tokens_seen": 26577068512, + "step": 50700 + }, + { + "epoch": 0.484157553931097, + "grad_norm": 0.14321520924568176, + "learning_rate": 0.001, + "loss": 2.1749, + "num_input_tokens_seen": 26603282368, + "step": 50750 + }, + { + "epoch": 0.48463455644728526, + "grad_norm": 0.15269018709659576, + "learning_rate": 0.001, + "loss": 2.166, + "num_input_tokens_seen": 26629493472, + "step": 50800 + }, + { + "epoch": 0.4851115589634735, + "grad_norm": 0.1434074342250824, + "learning_rate": 0.001, + "loss": 2.157, + "num_input_tokens_seen": 26655707872, + "step": 50850 + }, + { + "epoch": 0.4855885614796618, + "grad_norm": 0.1321389526128769, + "learning_rate": 0.001, + "loss": 2.1604, + "num_input_tokens_seen": 26681921088, + "step": 50900 + }, + { + "epoch": 0.4860655639958501, + "grad_norm": 0.1456880420446396, + "learning_rate": 0.001, + "loss": 2.1602, + "num_input_tokens_seen": 26708125664, + "step": 50950 + }, + { + "epoch": 0.48654256651203837, + "grad_norm": 0.1457262486219406, + "learning_rate": 0.001, + "loss": 2.169, + "num_input_tokens_seen": 26734340064, + "step": 51000 + }, + { + "epoch": 0.48654256651203837, + "eval_loss": 2.08145809173584, + "eval_runtime": 80.0702, + "eval_samples_per_second": 62.445, + "eval_steps_per_second": 15.611, + "num_input_tokens_seen": 26734340064, + "step": 51000 + }, + { + "epoch": 0.48701956902822663, + "grad_norm": 0.13553930819034576, + "learning_rate": 0.001, + "loss": 2.1677, + "num_input_tokens_seen": 26760554464, + "step": 51050 + }, + { + "epoch": 0.4874965715444149, + "grad_norm": 0.15178151428699493, + "learning_rate": 0.001, + "loss": 2.1729, + "num_input_tokens_seen": 26786768096, + "step": 51100 + }, + { + "epoch": 0.48797357406060315, + "grad_norm": 0.14045366644859314, + "learning_rate": 0.001, + "loss": 2.1808, + "num_input_tokens_seen": 26812978944, + "step": 51150 + }, + { + "epoch": 0.4884505765767914, + "grad_norm": 0.1409856528043747, + "learning_rate": 0.001, + "loss": 2.1614, + "num_input_tokens_seen": 26839186816, + "step": 51200 + }, + { + "epoch": 0.48892757909297974, + "grad_norm": 0.16543184220790863, + "learning_rate": 0.001, + "loss": 2.1763, + "num_input_tokens_seen": 26865386144, + "step": 51250 + }, + { + "epoch": 0.489404581609168, + "grad_norm": 0.14048989117145538, + "learning_rate": 0.001, + "loss": 2.18, + "num_input_tokens_seen": 26891600064, + "step": 51300 + }, + { + "epoch": 0.48988158412535626, + "grad_norm": 0.13343140482902527, + "learning_rate": 0.001, + "loss": 2.1598, + "num_input_tokens_seen": 26917798688, + "step": 51350 + }, + { + "epoch": 0.4903585866415445, + "grad_norm": 0.1373709738254547, + "learning_rate": 0.001, + "loss": 2.1646, + "num_input_tokens_seen": 26944004160, + "step": 51400 + }, + { + "epoch": 0.4908355891577328, + "grad_norm": 0.14587919414043427, + "learning_rate": 0.001, + "loss": 2.1611, + "num_input_tokens_seen": 26970218560, + "step": 51450 + }, + { + "epoch": 0.4913125916739211, + "grad_norm": 0.1413598656654358, + "learning_rate": 0.001, + "loss": 2.1738, + "num_input_tokens_seen": 26996432960, + "step": 51500 + }, + { + "epoch": 0.4913125916739211, + "eval_loss": 2.0796499252319336, + "eval_runtime": 80.2566, + "eval_samples_per_second": 62.3, + "eval_steps_per_second": 15.575, + "num_input_tokens_seen": 26996432960, + "step": 51500 + }, + { + "epoch": 0.49178959419010937, + "grad_norm": 0.12603874504566193, + "learning_rate": 0.001, + "loss": 2.1487, + "num_input_tokens_seen": 27022644512, + "step": 51550 + }, + { + "epoch": 0.49226659670629763, + "grad_norm": 0.15144561231136322, + "learning_rate": 0.001, + "loss": 2.1619, + "num_input_tokens_seen": 27048858912, + "step": 51600 + }, + { + "epoch": 0.4927435992224859, + "grad_norm": 0.15037000179290771, + "learning_rate": 0.001, + "loss": 2.1639, + "num_input_tokens_seen": 27075051872, + "step": 51650 + }, + { + "epoch": 0.49322060173867416, + "grad_norm": 0.14638635516166687, + "learning_rate": 0.001, + "loss": 2.1688, + "num_input_tokens_seen": 27101261408, + "step": 51700 + }, + { + "epoch": 0.4936976042548624, + "grad_norm": 0.1408969908952713, + "learning_rate": 0.001, + "loss": 2.1768, + "num_input_tokens_seen": 27127465824, + "step": 51750 + }, + { + "epoch": 0.49417460677105074, + "grad_norm": 0.15393120050430298, + "learning_rate": 0.001, + "loss": 2.1719, + "num_input_tokens_seen": 27153675296, + "step": 51800 + }, + { + "epoch": 0.494651609287239, + "grad_norm": 0.13638852536678314, + "learning_rate": 0.001, + "loss": 2.1553, + "num_input_tokens_seen": 27179888192, + "step": 51850 + }, + { + "epoch": 0.49512861180342727, + "grad_norm": 0.1885574907064438, + "learning_rate": 0.001, + "loss": 2.1764, + "num_input_tokens_seen": 27206102592, + "step": 51900 + }, + { + "epoch": 0.49560561431961553, + "grad_norm": 0.15341401100158691, + "learning_rate": 0.001, + "loss": 2.172, + "num_input_tokens_seen": 27232311776, + "step": 51950 + }, + { + "epoch": 0.4960826168358038, + "grad_norm": 0.17189666628837585, + "learning_rate": 0.001, + "loss": 2.169, + "num_input_tokens_seen": 27258524544, + "step": 52000 + }, + { + "epoch": 0.4960826168358038, + "eval_loss": 2.080231189727783, + "eval_runtime": 80.3516, + "eval_samples_per_second": 62.227, + "eval_steps_per_second": 15.557, + "num_input_tokens_seen": 27258524544, + "step": 52000 + }, + { + "epoch": 0.49655961935199205, + "grad_norm": 0.14001034200191498, + "learning_rate": 0.001, + "loss": 2.164, + "num_input_tokens_seen": 27284732256, + "step": 52050 + }, + { + "epoch": 0.4970366218681804, + "grad_norm": 0.16195432841777802, + "learning_rate": 0.001, + "loss": 2.1655, + "num_input_tokens_seen": 27310934944, + "step": 52100 + }, + { + "epoch": 0.49751362438436864, + "grad_norm": 0.14236243069171906, + "learning_rate": 0.001, + "loss": 2.1629, + "num_input_tokens_seen": 27337149344, + "step": 52150 + }, + { + "epoch": 0.4979906269005569, + "grad_norm": 0.13297200202941895, + "learning_rate": 0.001, + "loss": 2.1536, + "num_input_tokens_seen": 27363363744, + "step": 52200 + }, + { + "epoch": 0.49846762941674516, + "grad_norm": 0.13531994819641113, + "learning_rate": 0.001, + "loss": 2.1595, + "num_input_tokens_seen": 27389575584, + "step": 52250 + }, + { + "epoch": 0.4989446319329334, + "grad_norm": 0.13988707959651947, + "learning_rate": 0.001, + "loss": 2.1727, + "num_input_tokens_seen": 27415789984, + "step": 52300 + }, + { + "epoch": 0.49942163444912174, + "grad_norm": 0.151968851685524, + "learning_rate": 0.001, + "loss": 2.1668, + "num_input_tokens_seen": 27442001344, + "step": 52350 + }, + { + "epoch": 0.49989863696531, + "grad_norm": 0.12845058739185333, + "learning_rate": 0.001, + "loss": 2.1681, + "num_input_tokens_seen": 27468215456, + "step": 52400 + }, + { + "epoch": 0.5003756394814982, + "grad_norm": 0.14046697318553925, + "learning_rate": 0.001, + "loss": 2.1608, + "num_input_tokens_seen": 27494423232, + "step": 52450 + }, + { + "epoch": 0.5008526419976865, + "grad_norm": 0.147069051861763, + "learning_rate": 0.001, + "loss": 2.1557, + "num_input_tokens_seen": 27520636736, + "step": 52500 + }, + { + "epoch": 0.5008526419976865, + "eval_loss": 2.077626943588257, + "eval_runtime": 80.4239, + "eval_samples_per_second": 62.171, + "eval_steps_per_second": 15.543, + "num_input_tokens_seen": 27520636736, + "step": 52500 + }, + { + "epoch": 0.5013296445138749, + "grad_norm": 0.14614522457122803, + "learning_rate": 0.001, + "loss": 2.1555, + "num_input_tokens_seen": 27546844160, + "step": 52550 + }, + { + "epoch": 0.5018066470300631, + "grad_norm": 0.15690810978412628, + "learning_rate": 0.001, + "loss": 2.1626, + "num_input_tokens_seen": 27573057792, + "step": 52600 + }, + { + "epoch": 0.5022836495462514, + "grad_norm": 0.1412731409072876, + "learning_rate": 0.001, + "loss": 2.1639, + "num_input_tokens_seen": 27599266240, + "step": 52650 + }, + { + "epoch": 0.5027606520624396, + "grad_norm": 0.15567494928836823, + "learning_rate": 0.001, + "loss": 2.1606, + "num_input_tokens_seen": 27625477504, + "step": 52700 + }, + { + "epoch": 0.5032376545786279, + "grad_norm": 0.13818155229091644, + "learning_rate": 0.001, + "loss": 2.1655, + "num_input_tokens_seen": 27651687680, + "step": 52750 + }, + { + "epoch": 0.5037146570948162, + "grad_norm": 0.15351204574108124, + "learning_rate": 0.001, + "loss": 2.1599, + "num_input_tokens_seen": 27677893760, + "step": 52800 + }, + { + "epoch": 0.5041916596110044, + "grad_norm": 0.1560334414243698, + "learning_rate": 0.001, + "loss": 2.1629, + "num_input_tokens_seen": 27704100960, + "step": 52850 + }, + { + "epoch": 0.5046686621271927, + "grad_norm": 0.1419985294342041, + "learning_rate": 0.001, + "loss": 2.1697, + "num_input_tokens_seen": 27730315136, + "step": 52900 + }, + { + "epoch": 0.505145664643381, + "grad_norm": 0.16582848131656647, + "learning_rate": 0.001, + "loss": 2.1592, + "num_input_tokens_seen": 27756524992, + "step": 52950 + }, + { + "epoch": 0.5056226671595693, + "grad_norm": 0.14559602737426758, + "learning_rate": 0.001, + "loss": 2.1765, + "num_input_tokens_seen": 27782732608, + "step": 53000 + }, + { + "epoch": 0.5056226671595693, + "eval_loss": 2.082807779312134, + "eval_runtime": 79.9388, + "eval_samples_per_second": 62.548, + "eval_steps_per_second": 15.637, + "num_input_tokens_seen": 27782732608, + "step": 53000 + }, + { + "epoch": 0.5060996696757576, + "grad_norm": 0.1368139386177063, + "learning_rate": 0.001, + "loss": 2.1633, + "num_input_tokens_seen": 27808946304, + "step": 53050 + }, + { + "epoch": 0.5065766721919458, + "grad_norm": 0.13983768224716187, + "learning_rate": 0.001, + "loss": 2.1664, + "num_input_tokens_seen": 27835155744, + "step": 53100 + }, + { + "epoch": 0.5070536747081341, + "grad_norm": 0.14930413663387299, + "learning_rate": 0.001, + "loss": 2.1593, + "num_input_tokens_seen": 27861367328, + "step": 53150 + }, + { + "epoch": 0.5075306772243223, + "grad_norm": 0.1432899385690689, + "learning_rate": 0.001, + "loss": 2.16, + "num_input_tokens_seen": 27887572224, + "step": 53200 + }, + { + "epoch": 0.5080076797405106, + "grad_norm": 0.1435759961605072, + "learning_rate": 0.001, + "loss": 2.1647, + "num_input_tokens_seen": 27913770912, + "step": 53250 + }, + { + "epoch": 0.5084846822566989, + "grad_norm": 0.14046362042427063, + "learning_rate": 0.001, + "loss": 2.1631, + "num_input_tokens_seen": 27939983648, + "step": 53300 + }, + { + "epoch": 0.5089616847728872, + "grad_norm": 0.14235271513462067, + "learning_rate": 0.001, + "loss": 2.1594, + "num_input_tokens_seen": 27966198048, + "step": 53350 + }, + { + "epoch": 0.5094386872890755, + "grad_norm": 0.14583303034305573, + "learning_rate": 0.001, + "loss": 2.1593, + "num_input_tokens_seen": 27992412448, + "step": 53400 + }, + { + "epoch": 0.5099156898052637, + "grad_norm": 0.1448000818490982, + "learning_rate": 0.001, + "loss": 2.1634, + "num_input_tokens_seen": 28018625792, + "step": 53450 + }, + { + "epoch": 0.510392692321452, + "grad_norm": 0.1414560228586197, + "learning_rate": 0.001, + "loss": 2.1616, + "num_input_tokens_seen": 28044839456, + "step": 53500 + }, + { + "epoch": 0.510392692321452, + "eval_loss": 2.076679229736328, + "eval_runtime": 80.0345, + "eval_samples_per_second": 62.473, + "eval_steps_per_second": 15.618, + "num_input_tokens_seen": 28044839456, + "step": 53500 + }, + { + "epoch": 0.5108696948376402, + "grad_norm": 0.13138248026371002, + "learning_rate": 0.001, + "loss": 2.1597, + "num_input_tokens_seen": 28071046464, + "step": 53550 + }, + { + "epoch": 0.5113466973538285, + "grad_norm": 0.14930233359336853, + "learning_rate": 0.001, + "loss": 2.1546, + "num_input_tokens_seen": 28097258400, + "step": 53600 + }, + { + "epoch": 0.5118236998700169, + "grad_norm": 0.15363110601902008, + "learning_rate": 0.001, + "loss": 2.146, + "num_input_tokens_seen": 28123467136, + "step": 53650 + }, + { + "epoch": 0.5123007023862051, + "grad_norm": 0.13691812753677368, + "learning_rate": 0.001, + "loss": 2.1597, + "num_input_tokens_seen": 28149680096, + "step": 53700 + }, + { + "epoch": 0.5127777049023934, + "grad_norm": 0.14878015220165253, + "learning_rate": 0.001, + "loss": 2.154, + "num_input_tokens_seen": 28175888256, + "step": 53750 + }, + { + "epoch": 0.5132547074185816, + "grad_norm": 0.1334819197654724, + "learning_rate": 0.001, + "loss": 2.1518, + "num_input_tokens_seen": 28202098624, + "step": 53800 + }, + { + "epoch": 0.5137317099347699, + "grad_norm": 0.14382654428482056, + "learning_rate": 0.001, + "loss": 2.1755, + "num_input_tokens_seen": 28228306496, + "step": 53850 + }, + { + "epoch": 0.5142087124509582, + "grad_norm": 0.13571012020111084, + "learning_rate": 0.001, + "loss": 2.1601, + "num_input_tokens_seen": 28254520640, + "step": 53900 + }, + { + "epoch": 0.5146857149671464, + "grad_norm": 0.13496848940849304, + "learning_rate": 0.001, + "loss": 2.1635, + "num_input_tokens_seen": 28280731968, + "step": 53950 + }, + { + "epoch": 0.5151627174833348, + "grad_norm": 0.13804535567760468, + "learning_rate": 0.001, + "loss": 2.1569, + "num_input_tokens_seen": 28306946368, + "step": 54000 + }, + { + "epoch": 0.5151627174833348, + "eval_loss": 2.075817108154297, + "eval_runtime": 80.0868, + "eval_samples_per_second": 62.432, + "eval_steps_per_second": 15.608, + "num_input_tokens_seen": 28306946368, + "step": 54000 + }, + { + "epoch": 0.515639719999523, + "grad_norm": 0.13443215191364288, + "learning_rate": 0.001, + "loss": 2.1567, + "num_input_tokens_seen": 28333154944, + "step": 54050 + }, + { + "epoch": 0.5161167225157113, + "grad_norm": 0.141039177775383, + "learning_rate": 0.001, + "loss": 2.1683, + "num_input_tokens_seen": 28359365504, + "step": 54100 + }, + { + "epoch": 0.5165937250318996, + "grad_norm": 0.14250704646110535, + "learning_rate": 0.001, + "loss": 2.1604, + "num_input_tokens_seen": 28385573376, + "step": 54150 + }, + { + "epoch": 0.5170707275480878, + "grad_norm": 0.14478139579296112, + "learning_rate": 0.001, + "loss": 2.1557, + "num_input_tokens_seen": 28411779968, + "step": 54200 + }, + { + "epoch": 0.5175477300642761, + "grad_norm": 0.14316383004188538, + "learning_rate": 0.001, + "loss": 2.1604, + "num_input_tokens_seen": 28437988288, + "step": 54250 + }, + { + "epoch": 0.5180247325804643, + "grad_norm": 0.15016962587833405, + "learning_rate": 0.001, + "loss": 2.1591, + "num_input_tokens_seen": 28464201280, + "step": 54300 + }, + { + "epoch": 0.5185017350966526, + "grad_norm": 0.1401468962430954, + "learning_rate": 0.001, + "loss": 2.1483, + "num_input_tokens_seen": 28490409024, + "step": 54350 + }, + { + "epoch": 0.5189787376128409, + "grad_norm": 0.14955569803714752, + "learning_rate": 0.001, + "loss": 2.163, + "num_input_tokens_seen": 28516622176, + "step": 54400 + }, + { + "epoch": 0.5194557401290292, + "grad_norm": 0.1313570886850357, + "learning_rate": 0.001, + "loss": 2.1526, + "num_input_tokens_seen": 28542833568, + "step": 54450 + }, + { + "epoch": 0.5199327426452175, + "grad_norm": 0.15107598900794983, + "learning_rate": 0.001, + "loss": 2.1561, + "num_input_tokens_seen": 28569047936, + "step": 54500 + }, + { + "epoch": 0.5199327426452175, + "eval_loss": 2.074584484100342, + "eval_runtime": 80.0388, + "eval_samples_per_second": 62.47, + "eval_steps_per_second": 15.617, + "num_input_tokens_seen": 28569047936, + "step": 54500 + }, + { + "epoch": 0.5204097451614057, + "grad_norm": 0.14131924510002136, + "learning_rate": 0.001, + "loss": 2.1566, + "num_input_tokens_seen": 28595252544, + "step": 54550 + }, + { + "epoch": 0.520886747677594, + "grad_norm": 0.14203251898288727, + "learning_rate": 0.001, + "loss": 2.1472, + "num_input_tokens_seen": 28621463264, + "step": 54600 + }, + { + "epoch": 0.5213637501937822, + "grad_norm": 0.1641647219657898, + "learning_rate": 0.001, + "loss": 2.1512, + "num_input_tokens_seen": 28647667552, + "step": 54650 + }, + { + "epoch": 0.5218407527099705, + "grad_norm": 0.148850217461586, + "learning_rate": 0.001, + "loss": 2.1577, + "num_input_tokens_seen": 28673875968, + "step": 54700 + }, + { + "epoch": 0.5223177552261589, + "grad_norm": 0.1406261920928955, + "learning_rate": 0.001, + "loss": 2.1597, + "num_input_tokens_seen": 28700090368, + "step": 54750 + }, + { + "epoch": 0.5227947577423471, + "grad_norm": 0.1583367884159088, + "learning_rate": 0.001, + "loss": 2.1712, + "num_input_tokens_seen": 28726303904, + "step": 54800 + }, + { + "epoch": 0.5232717602585354, + "grad_norm": 0.1432129442691803, + "learning_rate": 0.001, + "loss": 2.1565, + "num_input_tokens_seen": 28752517600, + "step": 54850 + }, + { + "epoch": 0.5237487627747236, + "grad_norm": 0.1418701410293579, + "learning_rate": 0.001, + "loss": 2.1515, + "num_input_tokens_seen": 28778730016, + "step": 54900 + }, + { + "epoch": 0.5242257652909119, + "grad_norm": 0.14983917772769928, + "learning_rate": 0.001, + "loss": 2.1529, + "num_input_tokens_seen": 28804938496, + "step": 54950 + }, + { + "epoch": 0.5247027678071002, + "grad_norm": 0.13952849805355072, + "learning_rate": 0.001, + "loss": 2.1554, + "num_input_tokens_seen": 28831152896, + "step": 55000 + }, + { + "epoch": 0.5247027678071002, + "eval_loss": 2.072495222091675, + "eval_runtime": 80.1672, + "eval_samples_per_second": 62.37, + "eval_steps_per_second": 15.592, + "num_input_tokens_seen": 28831152896, + "step": 55000 + }, + { + "epoch": 0.5251797703232884, + "grad_norm": 0.15341860055923462, + "learning_rate": 0.001, + "loss": 2.1638, + "num_input_tokens_seen": 28857367296, + "step": 55050 + }, + { + "epoch": 0.5256567728394768, + "grad_norm": 0.1550397276878357, + "learning_rate": 0.001, + "loss": 2.1591, + "num_input_tokens_seen": 28883573728, + "step": 55100 + }, + { + "epoch": 0.526133775355665, + "grad_norm": 0.13328562676906586, + "learning_rate": 0.001, + "loss": 2.1647, + "num_input_tokens_seen": 28909785952, + "step": 55150 + }, + { + "epoch": 0.5266107778718533, + "grad_norm": 0.14107167720794678, + "learning_rate": 0.001, + "loss": 2.1543, + "num_input_tokens_seen": 28935993792, + "step": 55200 + }, + { + "epoch": 0.5270877803880415, + "grad_norm": 0.13323615491390228, + "learning_rate": 0.001, + "loss": 2.154, + "num_input_tokens_seen": 28962207328, + "step": 55250 + }, + { + "epoch": 0.5275647829042298, + "grad_norm": 0.14103908836841583, + "learning_rate": 0.001, + "loss": 2.159, + "num_input_tokens_seen": 28988419424, + "step": 55300 + }, + { + "epoch": 0.5280417854204181, + "grad_norm": 0.14379121363162994, + "learning_rate": 0.001, + "loss": 2.1515, + "num_input_tokens_seen": 29014627904, + "step": 55350 + }, + { + "epoch": 0.5285187879366063, + "grad_norm": 0.14381948113441467, + "learning_rate": 0.001, + "loss": 2.142, + "num_input_tokens_seen": 29040837120, + "step": 55400 + }, + { + "epoch": 0.5289957904527947, + "grad_norm": 0.13829398155212402, + "learning_rate": 0.001, + "loss": 2.1573, + "num_input_tokens_seen": 29067049824, + "step": 55450 + }, + { + "epoch": 0.5294727929689829, + "grad_norm": 0.14373236894607544, + "learning_rate": 0.001, + "loss": 2.1505, + "num_input_tokens_seen": 29093257888, + "step": 55500 + }, + { + "epoch": 0.5294727929689829, + "eval_loss": 2.0716280937194824, + "eval_runtime": 80.497, + "eval_samples_per_second": 62.114, + "eval_steps_per_second": 15.529, + "num_input_tokens_seen": 29093257888, + "step": 55500 + }, + { + "epoch": 0.5299497954851712, + "grad_norm": 0.1435646265745163, + "learning_rate": 0.001, + "loss": 2.1534, + "num_input_tokens_seen": 29119472096, + "step": 55550 + }, + { + "epoch": 0.5304267980013595, + "grad_norm": 0.15286391973495483, + "learning_rate": 0.001, + "loss": 2.1538, + "num_input_tokens_seen": 29145685600, + "step": 55600 + }, + { + "epoch": 0.5309038005175477, + "grad_norm": 0.15763621032238007, + "learning_rate": 0.001, + "loss": 2.1598, + "num_input_tokens_seen": 29171894688, + "step": 55650 + }, + { + "epoch": 0.531380803033736, + "grad_norm": 0.17268946766853333, + "learning_rate": 0.001, + "loss": 2.1619, + "num_input_tokens_seen": 29198109088, + "step": 55700 + }, + { + "epoch": 0.5318578055499242, + "grad_norm": 0.14589810371398926, + "learning_rate": 0.001, + "loss": 2.1517, + "num_input_tokens_seen": 29224315616, + "step": 55750 + }, + { + "epoch": 0.5323348080661126, + "grad_norm": 0.14555124938488007, + "learning_rate": 0.001, + "loss": 2.1583, + "num_input_tokens_seen": 29250529568, + "step": 55800 + }, + { + "epoch": 0.5328118105823009, + "grad_norm": 0.15364859998226166, + "learning_rate": 0.001, + "loss": 2.1539, + "num_input_tokens_seen": 29276735264, + "step": 55850 + }, + { + "epoch": 0.5332888130984891, + "grad_norm": 0.14615200459957123, + "learning_rate": 0.001, + "loss": 2.1589, + "num_input_tokens_seen": 29302947520, + "step": 55900 + }, + { + "epoch": 0.5337658156146774, + "grad_norm": 0.13198421895503998, + "learning_rate": 0.001, + "loss": 2.1608, + "num_input_tokens_seen": 29329161920, + "step": 55950 + }, + { + "epoch": 0.5342428181308656, + "grad_norm": 0.1391836404800415, + "learning_rate": 0.001, + "loss": 2.1491, + "num_input_tokens_seen": 29355372320, + "step": 56000 + }, + { + "epoch": 0.5342428181308656, + "eval_loss": 2.0713729858398438, + "eval_runtime": 80.4159, + "eval_samples_per_second": 62.177, + "eval_steps_per_second": 15.544, + "num_input_tokens_seen": 29355372320, + "step": 56000 + }, + { + "epoch": 0.5347198206470539, + "grad_norm": 0.14797906577587128, + "learning_rate": 0.001, + "loss": 2.1577, + "num_input_tokens_seen": 29381576544, + "step": 56050 + }, + { + "epoch": 0.5351968231632422, + "grad_norm": 0.15340618789196014, + "learning_rate": 0.001, + "loss": 2.1528, + "num_input_tokens_seen": 29407783104, + "step": 56100 + }, + { + "epoch": 0.5356738256794304, + "grad_norm": 0.15017147362232208, + "learning_rate": 0.001, + "loss": 2.151, + "num_input_tokens_seen": 29433993216, + "step": 56150 + }, + { + "epoch": 0.5361508281956188, + "grad_norm": 0.13791312277317047, + "learning_rate": 0.001, + "loss": 2.1513, + "num_input_tokens_seen": 29460207616, + "step": 56200 + }, + { + "epoch": 0.536627830711807, + "grad_norm": 0.14975033700466156, + "learning_rate": 0.001, + "loss": 2.1624, + "num_input_tokens_seen": 29486420512, + "step": 56250 + }, + { + "epoch": 0.5371048332279953, + "grad_norm": 0.1503009796142578, + "learning_rate": 0.001, + "loss": 2.1662, + "num_input_tokens_seen": 29512633952, + "step": 56300 + }, + { + "epoch": 0.5375818357441835, + "grad_norm": 0.15859892964363098, + "learning_rate": 0.001, + "loss": 2.1643, + "num_input_tokens_seen": 29538847776, + "step": 56350 + }, + { + "epoch": 0.5380588382603718, + "grad_norm": 0.14404819905757904, + "learning_rate": 0.001, + "loss": 2.149, + "num_input_tokens_seen": 29565059040, + "step": 56400 + }, + { + "epoch": 0.5385358407765601, + "grad_norm": 0.14447428286075592, + "learning_rate": 0.001, + "loss": 2.1533, + "num_input_tokens_seen": 29591271488, + "step": 56450 + }, + { + "epoch": 0.5390128432927483, + "grad_norm": 0.1475590616464615, + "learning_rate": 0.001, + "loss": 2.1471, + "num_input_tokens_seen": 29617485024, + "step": 56500 + }, + { + "epoch": 0.5390128432927483, + "eval_loss": 2.070662498474121, + "eval_runtime": 80.5279, + "eval_samples_per_second": 62.09, + "eval_steps_per_second": 15.523, + "num_input_tokens_seen": 29617485024, + "step": 56500 + }, + { + "epoch": 0.5394898458089367, + "grad_norm": 0.15244145691394806, + "learning_rate": 0.001, + "loss": 2.1552, + "num_input_tokens_seen": 29643697696, + "step": 56550 + }, + { + "epoch": 0.5399668483251249, + "grad_norm": 0.1519034206867218, + "learning_rate": 0.001, + "loss": 2.1538, + "num_input_tokens_seen": 29669908640, + "step": 56600 + }, + { + "epoch": 0.5404438508413132, + "grad_norm": 0.14255867898464203, + "learning_rate": 0.001, + "loss": 2.1503, + "num_input_tokens_seen": 29696119200, + "step": 56650 + }, + { + "epoch": 0.5409208533575015, + "grad_norm": 0.13525427877902985, + "learning_rate": 0.001, + "loss": 2.161, + "num_input_tokens_seen": 29722325888, + "step": 56700 + }, + { + "epoch": 0.5413978558736897, + "grad_norm": 0.15784476697444916, + "learning_rate": 0.001, + "loss": 2.1473, + "num_input_tokens_seen": 29748536224, + "step": 56750 + }, + { + "epoch": 0.541874858389878, + "grad_norm": 0.1454872041940689, + "learning_rate": 0.001, + "loss": 2.1471, + "num_input_tokens_seen": 29774749824, + "step": 56800 + }, + { + "epoch": 0.5423518609060662, + "grad_norm": 0.1350981444120407, + "learning_rate": 0.001, + "loss": 2.1638, + "num_input_tokens_seen": 29800959392, + "step": 56850 + }, + { + "epoch": 0.5428288634222546, + "grad_norm": 0.13668446242809296, + "learning_rate": 0.001, + "loss": 2.1497, + "num_input_tokens_seen": 29827173792, + "step": 56900 + }, + { + "epoch": 0.5433058659384429, + "grad_norm": 0.14868319034576416, + "learning_rate": 0.001, + "loss": 2.1597, + "num_input_tokens_seen": 29853388192, + "step": 56950 + }, + { + "epoch": 0.5437828684546311, + "grad_norm": 0.15703202784061432, + "learning_rate": 0.001, + "loss": 2.1465, + "num_input_tokens_seen": 29879599072, + "step": 57000 + }, + { + "epoch": 0.5437828684546311, + "eval_loss": 2.0691609382629395, + "eval_runtime": 79.8554, + "eval_samples_per_second": 62.613, + "eval_steps_per_second": 15.653, + "num_input_tokens_seen": 29879599072, + "step": 57000 + }, + { + "epoch": 0.5442598709708194, + "grad_norm": 0.1478765606880188, + "learning_rate": 0.001, + "loss": 2.1537, + "num_input_tokens_seen": 29905809824, + "step": 57050 + }, + { + "epoch": 0.5447368734870076, + "grad_norm": 0.15318194031715393, + "learning_rate": 0.001, + "loss": 2.1605, + "num_input_tokens_seen": 29932019488, + "step": 57100 + }, + { + "epoch": 0.5452138760031959, + "grad_norm": 0.14850732684135437, + "learning_rate": 0.001, + "loss": 2.1562, + "num_input_tokens_seen": 29958231520, + "step": 57150 + }, + { + "epoch": 0.5456908785193841, + "grad_norm": 0.14641685783863068, + "learning_rate": 0.001, + "loss": 2.1582, + "num_input_tokens_seen": 29984433824, + "step": 57200 + }, + { + "epoch": 0.5461678810355725, + "grad_norm": 0.14056475460529327, + "learning_rate": 0.001, + "loss": 2.1609, + "num_input_tokens_seen": 30010648224, + "step": 57250 + }, + { + "epoch": 0.5466448835517608, + "grad_norm": 0.1431768536567688, + "learning_rate": 0.001, + "loss": 2.1562, + "num_input_tokens_seen": 30036862624, + "step": 57300 + }, + { + "epoch": 0.547121886067949, + "grad_norm": 0.13748180866241455, + "learning_rate": 0.001, + "loss": 2.1489, + "num_input_tokens_seen": 30063070240, + "step": 57350 + }, + { + "epoch": 0.5475988885841373, + "grad_norm": 0.1455860286951065, + "learning_rate": 0.001, + "loss": 2.1532, + "num_input_tokens_seen": 30089281728, + "step": 57400 + }, + { + "epoch": 0.5480758911003255, + "grad_norm": 0.13956403732299805, + "learning_rate": 0.001, + "loss": 2.1602, + "num_input_tokens_seen": 30115488128, + "step": 57450 + }, + { + "epoch": 0.5485528936165138, + "grad_norm": 0.13826127350330353, + "learning_rate": 0.001, + "loss": 2.1511, + "num_input_tokens_seen": 30141698752, + "step": 57500 + }, + { + "epoch": 0.5485528936165138, + "eval_loss": 2.068099021911621, + "eval_runtime": 80.8716, + "eval_samples_per_second": 61.826, + "eval_steps_per_second": 15.457, + "num_input_tokens_seen": 30141698752, + "step": 57500 + }, + { + "epoch": 0.5490298961327021, + "grad_norm": 0.1383499652147293, + "learning_rate": 0.001, + "loss": 2.1446, + "num_input_tokens_seen": 30167909792, + "step": 57550 + }, + { + "epoch": 0.5495068986488904, + "grad_norm": 0.14953608810901642, + "learning_rate": 0.001, + "loss": 2.1542, + "num_input_tokens_seen": 30194124192, + "step": 57600 + }, + { + "epoch": 0.5499839011650787, + "grad_norm": 0.15742124617099762, + "learning_rate": 0.001, + "loss": 2.1323, + "num_input_tokens_seen": 30220323232, + "step": 57650 + }, + { + "epoch": 0.5504609036812669, + "grad_norm": 0.14026963710784912, + "learning_rate": 0.001, + "loss": 2.1545, + "num_input_tokens_seen": 30246537216, + "step": 57700 + }, + { + "epoch": 0.5509379061974552, + "grad_norm": 0.14743369817733765, + "learning_rate": 0.001, + "loss": 2.1414, + "num_input_tokens_seen": 30272747712, + "step": 57750 + }, + { + "epoch": 0.5514149087136435, + "grad_norm": 0.13608640432357788, + "learning_rate": 0.001, + "loss": 2.1511, + "num_input_tokens_seen": 30298946976, + "step": 57800 + }, + { + "epoch": 0.5518919112298317, + "grad_norm": 0.1552729308605194, + "learning_rate": 0.001, + "loss": 2.1574, + "num_input_tokens_seen": 30325148864, + "step": 57850 + }, + { + "epoch": 0.55236891374602, + "grad_norm": 0.13814964890480042, + "learning_rate": 0.001, + "loss": 2.1452, + "num_input_tokens_seen": 30351363072, + "step": 57900 + }, + { + "epoch": 0.5528459162622082, + "grad_norm": 0.14916428923606873, + "learning_rate": 0.001, + "loss": 2.1473, + "num_input_tokens_seen": 30377574912, + "step": 57950 + }, + { + "epoch": 0.5533229187783966, + "grad_norm": 0.14532601833343506, + "learning_rate": 0.001, + "loss": 2.1456, + "num_input_tokens_seen": 30403788864, + "step": 58000 + }, + { + "epoch": 0.5533229187783966, + "eval_loss": 2.068753480911255, + "eval_runtime": 80.9146, + "eval_samples_per_second": 61.794, + "eval_steps_per_second": 15.448, + "num_input_tokens_seen": 30403788864, + "step": 58000 + }, + { + "epoch": 0.5537999212945848, + "grad_norm": 0.15391622483730316, + "learning_rate": 0.001, + "loss": 2.1682, + "num_input_tokens_seen": 30429999072, + "step": 58050 + }, + { + "epoch": 0.5542769238107731, + "grad_norm": 0.1502559632062912, + "learning_rate": 0.001, + "loss": 2.1568, + "num_input_tokens_seen": 30456205120, + "step": 58100 + }, + { + "epoch": 0.5547539263269614, + "grad_norm": 0.13535556197166443, + "learning_rate": 0.001, + "loss": 2.1493, + "num_input_tokens_seen": 30482418400, + "step": 58150 + }, + { + "epoch": 0.5552309288431496, + "grad_norm": 0.14326569437980652, + "learning_rate": 0.001, + "loss": 2.1436, + "num_input_tokens_seen": 30508626208, + "step": 58200 + }, + { + "epoch": 0.5557079313593379, + "grad_norm": 0.14152300357818604, + "learning_rate": 0.001, + "loss": 2.1595, + "num_input_tokens_seen": 30534835584, + "step": 58250 + }, + { + "epoch": 0.5561849338755261, + "grad_norm": 0.14481306076049805, + "learning_rate": 0.001, + "loss": 2.1551, + "num_input_tokens_seen": 30561049984, + "step": 58300 + }, + { + "epoch": 0.5566619363917145, + "grad_norm": 0.13141630589962006, + "learning_rate": 0.001, + "loss": 2.1404, + "num_input_tokens_seen": 30587264384, + "step": 58350 + }, + { + "epoch": 0.5571389389079028, + "grad_norm": 0.15466631948947906, + "learning_rate": 0.001, + "loss": 2.1435, + "num_input_tokens_seen": 30613475328, + "step": 58400 + }, + { + "epoch": 0.557615941424091, + "grad_norm": 0.14728710055351257, + "learning_rate": 0.001, + "loss": 2.1499, + "num_input_tokens_seen": 30639680256, + "step": 58450 + }, + { + "epoch": 0.5580929439402793, + "grad_norm": 0.14924204349517822, + "learning_rate": 0.001, + "loss": 2.1591, + "num_input_tokens_seen": 30665890560, + "step": 58500 + }, + { + "epoch": 0.5580929439402793, + "eval_loss": 2.066398859024048, + "eval_runtime": 80.4535, + "eval_samples_per_second": 62.148, + "eval_steps_per_second": 15.537, + "num_input_tokens_seen": 30665890560, + "step": 58500 + }, + { + "epoch": 0.5585699464564675, + "grad_norm": 0.1510065197944641, + "learning_rate": 0.001, + "loss": 2.1457, + "num_input_tokens_seen": 30692103392, + "step": 58550 + }, + { + "epoch": 0.5590469489726558, + "grad_norm": 0.13719449937343597, + "learning_rate": 0.001, + "loss": 2.1502, + "num_input_tokens_seen": 30718316864, + "step": 58600 + }, + { + "epoch": 0.5595239514888442, + "grad_norm": 0.1269613802433014, + "learning_rate": 0.001, + "loss": 2.1563, + "num_input_tokens_seen": 30744530464, + "step": 58650 + }, + { + "epoch": 0.5600009540050324, + "grad_norm": 0.15356452763080597, + "learning_rate": 0.001, + "loss": 2.1496, + "num_input_tokens_seen": 30770743616, + "step": 58700 + }, + { + "epoch": 0.5604779565212207, + "grad_norm": 0.1365087777376175, + "learning_rate": 0.001, + "loss": 2.1601, + "num_input_tokens_seen": 30796958016, + "step": 58750 + }, + { + "epoch": 0.5609549590374089, + "grad_norm": 0.15105237066745758, + "learning_rate": 0.001, + "loss": 2.1504, + "num_input_tokens_seen": 30823166336, + "step": 58800 + }, + { + "epoch": 0.5614319615535972, + "grad_norm": 0.15393078327178955, + "learning_rate": 0.001, + "loss": 2.1523, + "num_input_tokens_seen": 30849379456, + "step": 58850 + }, + { + "epoch": 0.5619089640697855, + "grad_norm": 0.15258848667144775, + "learning_rate": 0.001, + "loss": 2.1679, + "num_input_tokens_seen": 30875585280, + "step": 58900 + }, + { + "epoch": 0.5623859665859737, + "grad_norm": 0.14167654514312744, + "learning_rate": 0.001, + "loss": 2.1626, + "num_input_tokens_seen": 30901791328, + "step": 58950 + }, + { + "epoch": 0.562862969102162, + "grad_norm": 0.1519978791475296, + "learning_rate": 0.001, + "loss": 2.1508, + "num_input_tokens_seen": 30927998464, + "step": 59000 + }, + { + "epoch": 0.562862969102162, + "eval_loss": 2.067110300064087, + "eval_runtime": 80.044, + "eval_samples_per_second": 62.466, + "eval_steps_per_second": 15.616, + "num_input_tokens_seen": 30927998464, + "step": 59000 + }, + { + "epoch": 0.5633399716183503, + "grad_norm": 0.1528017818927765, + "learning_rate": 0.001, + "loss": 2.1535, + "num_input_tokens_seen": 30954208032, + "step": 59050 + }, + { + "epoch": 0.5638169741345386, + "grad_norm": 0.13762027025222778, + "learning_rate": 0.001, + "loss": 2.1442, + "num_input_tokens_seen": 30980422432, + "step": 59100 + }, + { + "epoch": 0.5642939766507268, + "grad_norm": 0.15064965188503265, + "learning_rate": 0.001, + "loss": 2.1572, + "num_input_tokens_seen": 31006636832, + "step": 59150 + }, + { + "epoch": 0.5647709791669151, + "grad_norm": 0.14274545013904572, + "learning_rate": 0.001, + "loss": 2.1514, + "num_input_tokens_seen": 31032842400, + "step": 59200 + }, + { + "epoch": 0.5652479816831034, + "grad_norm": 0.15505096316337585, + "learning_rate": 0.001, + "loss": 2.1544, + "num_input_tokens_seen": 31059054240, + "step": 59250 + }, + { + "epoch": 0.5657249841992916, + "grad_norm": 0.1395845264196396, + "learning_rate": 0.001, + "loss": 2.1578, + "num_input_tokens_seen": 31085264224, + "step": 59300 + }, + { + "epoch": 0.5662019867154799, + "grad_norm": 0.14424628019332886, + "learning_rate": 0.001, + "loss": 2.1521, + "num_input_tokens_seen": 31111477504, + "step": 59350 + }, + { + "epoch": 0.5666789892316682, + "grad_norm": 0.14009548723697662, + "learning_rate": 0.001, + "loss": 2.1393, + "num_input_tokens_seen": 31137691904, + "step": 59400 + }, + { + "epoch": 0.5671559917478565, + "grad_norm": 0.1312466412782669, + "learning_rate": 0.001, + "loss": 2.1489, + "num_input_tokens_seen": 31163906304, + "step": 59450 + }, + { + "epoch": 0.5676329942640448, + "grad_norm": 0.14498138427734375, + "learning_rate": 0.001, + "loss": 2.1466, + "num_input_tokens_seen": 31190116608, + "step": 59500 + }, + { + "epoch": 0.5676329942640448, + "eval_loss": 2.06640362739563, + "eval_runtime": 80.0487, + "eval_samples_per_second": 62.462, + "eval_steps_per_second": 15.615, + "num_input_tokens_seen": 31190116608, + "step": 59500 + }, + { + "epoch": 0.568109996780233, + "grad_norm": 0.13518276810646057, + "learning_rate": 0.001, + "loss": 2.1462, + "num_input_tokens_seen": 31216326496, + "step": 59550 + }, + { + "epoch": 0.5685869992964213, + "grad_norm": 0.14145778119564056, + "learning_rate": 0.001, + "loss": 2.1474, + "num_input_tokens_seen": 31242536256, + "step": 59600 + }, + { + "epoch": 0.5690640018126095, + "grad_norm": 0.14637036621570587, + "learning_rate": 0.001, + "loss": 2.1475, + "num_input_tokens_seen": 31268747936, + "step": 59650 + }, + { + "epoch": 0.5695410043287978, + "grad_norm": 0.14279405772686005, + "learning_rate": 0.001, + "loss": 2.1417, + "num_input_tokens_seen": 31294956608, + "step": 59700 + }, + { + "epoch": 0.5700180068449862, + "grad_norm": 0.14410801231861115, + "learning_rate": 0.001, + "loss": 2.1456, + "num_input_tokens_seen": 31321167264, + "step": 59750 + }, + { + "epoch": 0.5704950093611744, + "grad_norm": 0.14293836057186127, + "learning_rate": 0.001, + "loss": 2.1475, + "num_input_tokens_seen": 31347378656, + "step": 59800 + }, + { + "epoch": 0.5709720118773627, + "grad_norm": 0.15702761709690094, + "learning_rate": 0.001, + "loss": 2.1488, + "num_input_tokens_seen": 31373586176, + "step": 59850 + }, + { + "epoch": 0.5714490143935509, + "grad_norm": 0.15636380016803741, + "learning_rate": 0.001, + "loss": 2.1426, + "num_input_tokens_seen": 31399797216, + "step": 59900 + }, + { + "epoch": 0.5719260169097392, + "grad_norm": 0.13920319080352783, + "learning_rate": 0.001, + "loss": 2.148, + "num_input_tokens_seen": 31426011616, + "step": 59950 + }, + { + "epoch": 0.5724030194259274, + "grad_norm": 0.14815059304237366, + "learning_rate": 0.001, + "loss": 2.1457, + "num_input_tokens_seen": 31452217632, + "step": 60000 + }, + { + "epoch": 0.5724030194259274, + "eval_loss": 2.064025402069092, + "eval_runtime": 79.9516, + "eval_samples_per_second": 62.538, + "eval_steps_per_second": 15.634, + "num_input_tokens_seen": 31452217632, + "step": 60000 + }, + { + "epoch": 0.5728800219421157, + "grad_norm": 0.15377280116081238, + "learning_rate": 0.001, + "loss": 2.1499, + "num_input_tokens_seen": 31478418880, + "step": 60050 + }, + { + "epoch": 0.573357024458304, + "grad_norm": 0.16740241646766663, + "learning_rate": 0.001, + "loss": 2.149, + "num_input_tokens_seen": 31504632704, + "step": 60100 + }, + { + "epoch": 0.5738340269744923, + "grad_norm": 0.1350049525499344, + "learning_rate": 0.001, + "loss": 2.1432, + "num_input_tokens_seen": 31530844128, + "step": 60150 + }, + { + "epoch": 0.5743110294906806, + "grad_norm": 0.145762100815773, + "learning_rate": 0.001, + "loss": 2.1492, + "num_input_tokens_seen": 31557058528, + "step": 60200 + }, + { + "epoch": 0.5747880320068688, + "grad_norm": 0.1441580355167389, + "learning_rate": 0.001, + "loss": 2.1417, + "num_input_tokens_seen": 31583268320, + "step": 60250 + }, + { + "epoch": 0.5752650345230571, + "grad_norm": 0.153322234749794, + "learning_rate": 0.001, + "loss": 2.1508, + "num_input_tokens_seen": 31609470656, + "step": 60300 + }, + { + "epoch": 0.5757420370392454, + "grad_norm": 0.14399965107440948, + "learning_rate": 0.001, + "loss": 2.1421, + "num_input_tokens_seen": 31635684704, + "step": 60350 + }, + { + "epoch": 0.5762190395554336, + "grad_norm": 0.13685567677021027, + "learning_rate": 0.001, + "loss": 2.1406, + "num_input_tokens_seen": 31661895968, + "step": 60400 + }, + { + "epoch": 0.576696042071622, + "grad_norm": 0.21189793944358826, + "learning_rate": 0.001, + "loss": 2.1565, + "num_input_tokens_seen": 31688104736, + "step": 60450 + }, + { + "epoch": 0.5771730445878102, + "grad_norm": 0.13776901364326477, + "learning_rate": 0.001, + "loss": 2.1496, + "num_input_tokens_seen": 31714314848, + "step": 60500 + }, + { + "epoch": 0.5771730445878102, + "eval_loss": 2.063593864440918, + "eval_runtime": 80.1547, + "eval_samples_per_second": 62.379, + "eval_steps_per_second": 15.595, + "num_input_tokens_seen": 31714314848, + "step": 60500 + }, + { + "epoch": 0.5776500471039985, + "grad_norm": 0.1537674516439438, + "learning_rate": 0.001, + "loss": 2.1439, + "num_input_tokens_seen": 31740523712, + "step": 60550 + }, + { + "epoch": 0.5781270496201868, + "grad_norm": 0.1462978571653366, + "learning_rate": 0.001, + "loss": 2.1539, + "num_input_tokens_seen": 31766736480, + "step": 60600 + }, + { + "epoch": 0.578604052136375, + "grad_norm": 0.14669708907604218, + "learning_rate": 0.001, + "loss": 2.1576, + "num_input_tokens_seen": 31792947904, + "step": 60650 + }, + { + "epoch": 0.5790810546525633, + "grad_norm": 0.1519545167684555, + "learning_rate": 0.001, + "loss": 2.1453, + "num_input_tokens_seen": 31819160672, + "step": 60700 + }, + { + "epoch": 0.5795580571687515, + "grad_norm": 0.1466340571641922, + "learning_rate": 0.001, + "loss": 2.1434, + "num_input_tokens_seen": 31845374816, + "step": 60750 + }, + { + "epoch": 0.5800350596849398, + "grad_norm": 0.13935734331607819, + "learning_rate": 0.001, + "loss": 2.1423, + "num_input_tokens_seen": 31871587232, + "step": 60800 + }, + { + "epoch": 0.5805120622011282, + "grad_norm": 0.14017197489738464, + "learning_rate": 0.001, + "loss": 2.14, + "num_input_tokens_seen": 31897801280, + "step": 60850 + }, + { + "epoch": 0.5809890647173164, + "grad_norm": 0.14253723621368408, + "learning_rate": 0.001, + "loss": 2.1387, + "num_input_tokens_seen": 31924014720, + "step": 60900 + }, + { + "epoch": 0.5814660672335047, + "grad_norm": 0.15480197966098785, + "learning_rate": 0.001, + "loss": 2.1647, + "num_input_tokens_seen": 31950220448, + "step": 60950 + }, + { + "epoch": 0.5819430697496929, + "grad_norm": 0.1502438485622406, + "learning_rate": 0.001, + "loss": 2.1418, + "num_input_tokens_seen": 31976431072, + "step": 61000 + }, + { + "epoch": 0.5819430697496929, + "eval_loss": 2.0649499893188477, + "eval_runtime": 80.1357, + "eval_samples_per_second": 62.394, + "eval_steps_per_second": 15.599, + "num_input_tokens_seen": 31976431072, + "step": 61000 + }, + { + "epoch": 0.5824200722658812, + "grad_norm": 0.14360016584396362, + "learning_rate": 0.001, + "loss": 2.1464, + "num_input_tokens_seen": 32002645472, + "step": 61050 + }, + { + "epoch": 0.5828970747820694, + "grad_norm": 0.1369880586862564, + "learning_rate": 0.001, + "loss": 2.1556, + "num_input_tokens_seen": 32028853216, + "step": 61100 + }, + { + "epoch": 0.5833740772982577, + "grad_norm": 0.1452026516199112, + "learning_rate": 0.001, + "loss": 2.1354, + "num_input_tokens_seen": 32055062080, + "step": 61150 + }, + { + "epoch": 0.5838510798144461, + "grad_norm": 0.14710959792137146, + "learning_rate": 0.001, + "loss": 2.1472, + "num_input_tokens_seen": 32081272192, + "step": 61200 + }, + { + "epoch": 0.5843280823306343, + "grad_norm": 0.14364252984523773, + "learning_rate": 0.001, + "loss": 2.1562, + "num_input_tokens_seen": 32107485248, + "step": 61250 + }, + { + "epoch": 0.5848050848468226, + "grad_norm": 0.1464463770389557, + "learning_rate": 0.001, + "loss": 2.1455, + "num_input_tokens_seen": 32133699648, + "step": 61300 + }, + { + "epoch": 0.5852820873630108, + "grad_norm": 0.13232292234897614, + "learning_rate": 0.001, + "loss": 2.1515, + "num_input_tokens_seen": 32159909088, + "step": 61350 + }, + { + "epoch": 0.5857590898791991, + "grad_norm": 0.14374487102031708, + "learning_rate": 0.001, + "loss": 2.1447, + "num_input_tokens_seen": 32186118336, + "step": 61400 + }, + { + "epoch": 0.5862360923953874, + "grad_norm": 0.13590660691261292, + "learning_rate": 0.001, + "loss": 2.1431, + "num_input_tokens_seen": 32212322976, + "step": 61450 + }, + { + "epoch": 0.5867130949115756, + "grad_norm": 0.14497828483581543, + "learning_rate": 0.001, + "loss": 2.1477, + "num_input_tokens_seen": 32238532768, + "step": 61500 + }, + { + "epoch": 0.5867130949115756, + "eval_loss": 2.0637688636779785, + "eval_runtime": 80.1948, + "eval_samples_per_second": 62.348, + "eval_steps_per_second": 15.587, + "num_input_tokens_seen": 32238532768, + "step": 61500 + }, + { + "epoch": 0.587190097427764, + "grad_norm": 0.15234364569187164, + "learning_rate": 0.001, + "loss": 2.1518, + "num_input_tokens_seen": 32264747168, + "step": 61550 + }, + { + "epoch": 0.5876670999439522, + "grad_norm": 0.15645268559455872, + "learning_rate": 0.001, + "loss": 2.1417, + "num_input_tokens_seen": 32290958592, + "step": 61600 + }, + { + "epoch": 0.5881441024601405, + "grad_norm": 0.13095822930335999, + "learning_rate": 0.001, + "loss": 2.1556, + "num_input_tokens_seen": 32317172864, + "step": 61650 + }, + { + "epoch": 0.5886211049763288, + "grad_norm": 0.14591479301452637, + "learning_rate": 0.001, + "loss": 2.1427, + "num_input_tokens_seen": 32343387264, + "step": 61700 + }, + { + "epoch": 0.589098107492517, + "grad_norm": 0.14499343931674957, + "learning_rate": 0.001, + "loss": 2.1476, + "num_input_tokens_seen": 32369597760, + "step": 61750 + }, + { + "epoch": 0.5895751100087053, + "grad_norm": 0.1538584977388382, + "learning_rate": 0.001, + "loss": 2.1479, + "num_input_tokens_seen": 32395811904, + "step": 61800 + }, + { + "epoch": 0.5900521125248935, + "grad_norm": 0.14564937353134155, + "learning_rate": 0.001, + "loss": 2.1369, + "num_input_tokens_seen": 32422005088, + "step": 61850 + }, + { + "epoch": 0.5905291150410819, + "grad_norm": 0.15571440756320953, + "learning_rate": 0.001, + "loss": 2.1452, + "num_input_tokens_seen": 32448215552, + "step": 61900 + }, + { + "epoch": 0.5910061175572701, + "grad_norm": 0.15152624249458313, + "learning_rate": 0.001, + "loss": 2.1501, + "num_input_tokens_seen": 32474410528, + "step": 61950 + }, + { + "epoch": 0.5914831200734584, + "grad_norm": 0.14020481705665588, + "learning_rate": 0.001, + "loss": 2.137, + "num_input_tokens_seen": 32500617568, + "step": 62000 + }, + { + "epoch": 0.5914831200734584, + "eval_loss": 2.0610291957855225, + "eval_runtime": 80.4672, + "eval_samples_per_second": 62.137, + "eval_steps_per_second": 15.534, + "num_input_tokens_seen": 32500617568, + "step": 62000 + }, + { + "epoch": 0.5919601225896467, + "grad_norm": 0.13953524827957153, + "learning_rate": 0.001, + "loss": 2.1443, + "num_input_tokens_seen": 32526826016, + "step": 62050 + }, + { + "epoch": 0.5924371251058349, + "grad_norm": 0.13924415409564972, + "learning_rate": 0.001, + "loss": 2.1439, + "num_input_tokens_seen": 32553032032, + "step": 62100 + }, + { + "epoch": 0.5929141276220232, + "grad_norm": 0.15134859085083008, + "learning_rate": 0.001, + "loss": 2.1457, + "num_input_tokens_seen": 32579225888, + "step": 62150 + }, + { + "epoch": 0.5933911301382114, + "grad_norm": 0.14563137292861938, + "learning_rate": 0.001, + "loss": 2.1426, + "num_input_tokens_seen": 32605435776, + "step": 62200 + }, + { + "epoch": 0.5938681326543997, + "grad_norm": 0.14072252810001373, + "learning_rate": 0.001, + "loss": 2.1561, + "num_input_tokens_seen": 32631646944, + "step": 62250 + }, + { + "epoch": 0.5943451351705881, + "grad_norm": 0.1321408897638321, + "learning_rate": 0.001, + "loss": 2.1328, + "num_input_tokens_seen": 32657860736, + "step": 62300 + }, + { + "epoch": 0.5948221376867763, + "grad_norm": 0.13987164199352264, + "learning_rate": 0.001, + "loss": 2.1409, + "num_input_tokens_seen": 32684073568, + "step": 62350 + }, + { + "epoch": 0.5952991402029646, + "grad_norm": 0.14474605023860931, + "learning_rate": 0.001, + "loss": 2.1499, + "num_input_tokens_seen": 32710284704, + "step": 62400 + }, + { + "epoch": 0.5957761427191528, + "grad_norm": 0.14219556748867035, + "learning_rate": 0.001, + "loss": 2.1514, + "num_input_tokens_seen": 32736491200, + "step": 62450 + }, + { + "epoch": 0.5962531452353411, + "grad_norm": 0.15876619517803192, + "learning_rate": 0.001, + "loss": 2.1415, + "num_input_tokens_seen": 32762704928, + "step": 62500 + }, + { + "epoch": 0.5962531452353411, + "eval_loss": 2.0606014728546143, + "eval_runtime": 80.5848, + "eval_samples_per_second": 62.046, + "eval_steps_per_second": 15.512, + "num_input_tokens_seen": 32762704928, + "step": 62500 + }, + { + "epoch": 0.5967301477515294, + "grad_norm": 0.14429977536201477, + "learning_rate": 0.001, + "loss": 2.1462, + "num_input_tokens_seen": 32788919328, + "step": 62550 + }, + { + "epoch": 0.5972071502677176, + "grad_norm": 0.1436418741941452, + "learning_rate": 0.001, + "loss": 2.1495, + "num_input_tokens_seen": 32815127776, + "step": 62600 + }, + { + "epoch": 0.597684152783906, + "grad_norm": 0.14079809188842773, + "learning_rate": 0.001, + "loss": 2.1478, + "num_input_tokens_seen": 32841342144, + "step": 62650 + }, + { + "epoch": 0.5981611553000942, + "grad_norm": 0.13799606263637543, + "learning_rate": 0.001, + "loss": 2.1487, + "num_input_tokens_seen": 32867549472, + "step": 62700 + }, + { + "epoch": 0.5986381578162825, + "grad_norm": 0.13869820535182953, + "learning_rate": 0.001, + "loss": 2.1379, + "num_input_tokens_seen": 32893759680, + "step": 62750 + }, + { + "epoch": 0.5991151603324708, + "grad_norm": 0.1471201330423355, + "learning_rate": 0.001, + "loss": 2.1573, + "num_input_tokens_seen": 32919968544, + "step": 62800 + }, + { + "epoch": 0.599592162848659, + "grad_norm": 0.14617429673671722, + "learning_rate": 0.001, + "loss": 2.1338, + "num_input_tokens_seen": 32946179008, + "step": 62850 + }, + { + "epoch": 0.6000691653648473, + "grad_norm": 0.14857004582881927, + "learning_rate": 0.001, + "loss": 2.1417, + "num_input_tokens_seen": 32972393408, + "step": 62900 + }, + { + "epoch": 0.6005461678810355, + "grad_norm": 0.1417587846517563, + "learning_rate": 0.001, + "loss": 2.1414, + "num_input_tokens_seen": 32998607808, + "step": 62950 + }, + { + "epoch": 0.6010231703972239, + "grad_norm": 0.13704361021518707, + "learning_rate": 0.001, + "loss": 2.1459, + "num_input_tokens_seen": 33024820736, + "step": 63000 + }, + { + "epoch": 0.6010231703972239, + "eval_loss": 2.0603103637695312, + "eval_runtime": 79.9162, + "eval_samples_per_second": 62.566, + "eval_steps_per_second": 15.641, + "num_input_tokens_seen": 33024820736, + "step": 63000 + }, + { + "epoch": 0.6015001729134121, + "grad_norm": 0.13333925604820251, + "learning_rate": 0.001, + "loss": 2.1431, + "num_input_tokens_seen": 33051029952, + "step": 63050 + }, + { + "epoch": 0.6019771754296004, + "grad_norm": 0.1660204976797104, + "learning_rate": 0.001, + "loss": 2.1439, + "num_input_tokens_seen": 33077244352, + "step": 63100 + }, + { + "epoch": 0.6024541779457887, + "grad_norm": 0.14403057098388672, + "learning_rate": 0.001, + "loss": 2.1485, + "num_input_tokens_seen": 33103448480, + "step": 63150 + }, + { + "epoch": 0.6029311804619769, + "grad_norm": 0.13897418975830078, + "learning_rate": 0.001, + "loss": 2.1481, + "num_input_tokens_seen": 33129662848, + "step": 63200 + }, + { + "epoch": 0.6034081829781652, + "grad_norm": 0.1460401862859726, + "learning_rate": 0.001, + "loss": 2.153, + "num_input_tokens_seen": 33155874816, + "step": 63250 + }, + { + "epoch": 0.6038851854943534, + "grad_norm": 0.1433630883693695, + "learning_rate": 0.001, + "loss": 2.1408, + "num_input_tokens_seen": 33182089024, + "step": 63300 + }, + { + "epoch": 0.6043621880105418, + "grad_norm": 0.1596335917711258, + "learning_rate": 0.001, + "loss": 2.1493, + "num_input_tokens_seen": 33208303136, + "step": 63350 + }, + { + "epoch": 0.6048391905267301, + "grad_norm": 0.1354464888572693, + "learning_rate": 0.001, + "loss": 2.1537, + "num_input_tokens_seen": 33234511872, + "step": 63400 + }, + { + "epoch": 0.6053161930429183, + "grad_norm": 0.14532111585140228, + "learning_rate": 0.001, + "loss": 2.1365, + "num_input_tokens_seen": 33260724384, + "step": 63450 + }, + { + "epoch": 0.6057931955591066, + "grad_norm": 0.14670905470848083, + "learning_rate": 0.001, + "loss": 2.1389, + "num_input_tokens_seen": 33286935872, + "step": 63500 + }, + { + "epoch": 0.6057931955591066, + "eval_loss": 2.058581590652466, + "eval_runtime": 79.4802, + "eval_samples_per_second": 62.909, + "eval_steps_per_second": 15.727, + "num_input_tokens_seen": 33286935872, + "step": 63500 + }, + { + "epoch": 0.6062701980752948, + "grad_norm": 0.1628328263759613, + "learning_rate": 0.001, + "loss": 2.1427, + "num_input_tokens_seen": 33313150272, + "step": 63550 + }, + { + "epoch": 0.6067472005914831, + "grad_norm": 0.14063307642936707, + "learning_rate": 0.001, + "loss": 2.1457, + "num_input_tokens_seen": 33339359200, + "step": 63600 + }, + { + "epoch": 0.6072242031076714, + "grad_norm": 0.1543467491865158, + "learning_rate": 0.001, + "loss": 2.1416, + "num_input_tokens_seen": 33365572768, + "step": 63650 + }, + { + "epoch": 0.6077012056238597, + "grad_norm": 0.13870377838611603, + "learning_rate": 0.001, + "loss": 2.1471, + "num_input_tokens_seen": 33391785248, + "step": 63700 + }, + { + "epoch": 0.608178208140048, + "grad_norm": 0.15191654860973358, + "learning_rate": 0.001, + "loss": 2.137, + "num_input_tokens_seen": 33417994304, + "step": 63750 + }, + { + "epoch": 0.6086552106562362, + "grad_norm": 0.15023675560951233, + "learning_rate": 0.001, + "loss": 2.1513, + "num_input_tokens_seen": 33444202144, + "step": 63800 + }, + { + "epoch": 0.6091322131724245, + "grad_norm": 0.14828945696353912, + "learning_rate": 0.001, + "loss": 2.1359, + "num_input_tokens_seen": 33470406880, + "step": 63850 + }, + { + "epoch": 0.6096092156886127, + "grad_norm": 0.1423346996307373, + "learning_rate": 0.001, + "loss": 2.121, + "num_input_tokens_seen": 33496614304, + "step": 63900 + }, + { + "epoch": 0.610086218204801, + "grad_norm": 0.1419682651758194, + "learning_rate": 0.001, + "loss": 2.1475, + "num_input_tokens_seen": 33522828352, + "step": 63950 + }, + { + "epoch": 0.6105632207209893, + "grad_norm": 0.14732129871845245, + "learning_rate": 0.001, + "loss": 2.1367, + "num_input_tokens_seen": 33549034848, + "step": 64000 + }, + { + "epoch": 0.6105632207209893, + "eval_loss": 2.0587704181671143, + "eval_runtime": 80.0375, + "eval_samples_per_second": 62.471, + "eval_steps_per_second": 15.618, + "num_input_tokens_seen": 33549034848, + "step": 64000 + }, + { + "epoch": 0.6110402232371775, + "grad_norm": 0.1425338238477707, + "learning_rate": 0.001, + "loss": 2.1458, + "num_input_tokens_seen": 33575249248, + "step": 64050 + }, + { + "epoch": 0.6115172257533659, + "grad_norm": 0.16987043619155884, + "learning_rate": 0.001, + "loss": 2.1534, + "num_input_tokens_seen": 33601461568, + "step": 64100 + }, + { + "epoch": 0.6119942282695541, + "grad_norm": 0.1452094316482544, + "learning_rate": 0.001, + "loss": 2.139, + "num_input_tokens_seen": 33627675392, + "step": 64150 + }, + { + "epoch": 0.6124712307857424, + "grad_norm": 0.14077788591384888, + "learning_rate": 0.001, + "loss": 2.1396, + "num_input_tokens_seen": 33653881632, + "step": 64200 + }, + { + "epoch": 0.6129482333019307, + "grad_norm": 0.14799240231513977, + "learning_rate": 0.001, + "loss": 2.1837, + "num_input_tokens_seen": 33680096032, + "step": 64250 + }, + { + "epoch": 0.6134252358181189, + "grad_norm": 0.14731177687644958, + "learning_rate": 0.001, + "loss": 2.1631, + "num_input_tokens_seen": 33706309408, + "step": 64300 + }, + { + "epoch": 0.6139022383343072, + "grad_norm": 0.14560987055301666, + "learning_rate": 0.001, + "loss": 2.1464, + "num_input_tokens_seen": 33732520736, + "step": 64350 + }, + { + "epoch": 0.6143792408504954, + "grad_norm": 0.15761514008045197, + "learning_rate": 0.001, + "loss": 2.1485, + "num_input_tokens_seen": 33758730752, + "step": 64400 + }, + { + "epoch": 0.6148562433666838, + "grad_norm": 0.14389316737651825, + "learning_rate": 0.001, + "loss": 2.1406, + "num_input_tokens_seen": 33784943872, + "step": 64450 + }, + { + "epoch": 0.6153332458828721, + "grad_norm": 0.12751001119613647, + "learning_rate": 0.001, + "loss": 2.147, + "num_input_tokens_seen": 33811149696, + "step": 64500 + }, + { + "epoch": 0.6153332458828721, + "eval_loss": 2.059323787689209, + "eval_runtime": 80.4929, + "eval_samples_per_second": 62.117, + "eval_steps_per_second": 15.529, + "num_input_tokens_seen": 33811149696, + "step": 64500 + }, + { + "epoch": 0.6158102483990603, + "grad_norm": 0.1340908706188202, + "learning_rate": 0.001, + "loss": 2.1525, + "num_input_tokens_seen": 33837360224, + "step": 64550 + }, + { + "epoch": 0.6162872509152486, + "grad_norm": 0.1480954885482788, + "learning_rate": 0.001, + "loss": 2.1394, + "num_input_tokens_seen": 33863574624, + "step": 64600 + }, + { + "epoch": 0.6167642534314368, + "grad_norm": 0.14949338138103485, + "learning_rate": 0.001, + "loss": 2.1289, + "num_input_tokens_seen": 33889784992, + "step": 64650 + }, + { + "epoch": 0.6172412559476251, + "grad_norm": 0.13703715801239014, + "learning_rate": 0.001, + "loss": 2.1479, + "num_input_tokens_seen": 33915999392, + "step": 64700 + }, + { + "epoch": 0.6177182584638133, + "grad_norm": 0.1442926824092865, + "learning_rate": 0.001, + "loss": 2.1409, + "num_input_tokens_seen": 33942206240, + "step": 64750 + }, + { + "epoch": 0.6181952609800017, + "grad_norm": 0.12989823520183563, + "learning_rate": 0.001, + "loss": 2.1306, + "num_input_tokens_seen": 33968419968, + "step": 64800 + }, + { + "epoch": 0.61867226349619, + "grad_norm": 0.14975020289421082, + "learning_rate": 0.001, + "loss": 2.1391, + "num_input_tokens_seen": 33994629952, + "step": 64850 + }, + { + "epoch": 0.6191492660123782, + "grad_norm": 0.1355099231004715, + "learning_rate": 0.001, + "loss": 2.1449, + "num_input_tokens_seen": 34020843104, + "step": 64900 + }, + { + "epoch": 0.6196262685285665, + "grad_norm": 0.13653282821178436, + "learning_rate": 0.001, + "loss": 2.1398, + "num_input_tokens_seen": 34047052768, + "step": 64950 + }, + { + "epoch": 0.6201032710447547, + "grad_norm": 0.1443834900856018, + "learning_rate": 0.001, + "loss": 2.1415, + "num_input_tokens_seen": 34073267168, + "step": 65000 + }, + { + "epoch": 0.6201032710447547, + "eval_loss": 2.0579845905303955, + "eval_runtime": 79.8036, + "eval_samples_per_second": 62.654, + "eval_steps_per_second": 15.663, + "num_input_tokens_seen": 34073267168, + "step": 65000 + }, + { + "epoch": 0.620580273560943, + "grad_norm": 0.13972166180610657, + "learning_rate": 0.001, + "loss": 2.1469, + "num_input_tokens_seen": 34099472608, + "step": 65050 + }, + { + "epoch": 0.6210572760771313, + "grad_norm": 0.15001018345355988, + "learning_rate": 0.001, + "loss": 2.1441, + "num_input_tokens_seen": 34125685056, + "step": 65100 + }, + { + "epoch": 0.6215342785933196, + "grad_norm": 0.13544200360774994, + "learning_rate": 0.001, + "loss": 2.1468, + "num_input_tokens_seen": 34151895488, + "step": 65150 + }, + { + "epoch": 0.6220112811095079, + "grad_norm": 0.13995452225208282, + "learning_rate": 0.001, + "loss": 2.1344, + "num_input_tokens_seen": 34178106848, + "step": 65200 + }, + { + "epoch": 0.6224882836256961, + "grad_norm": 0.14656807482242584, + "learning_rate": 0.001, + "loss": 2.1354, + "num_input_tokens_seen": 34204315008, + "step": 65250 + }, + { + "epoch": 0.6229652861418844, + "grad_norm": 0.14047770202159882, + "learning_rate": 0.001, + "loss": 2.1484, + "num_input_tokens_seen": 34230529184, + "step": 65300 + }, + { + "epoch": 0.6234422886580727, + "grad_norm": 0.1519668698310852, + "learning_rate": 0.001, + "loss": 2.146, + "num_input_tokens_seen": 34256735776, + "step": 65350 + }, + { + "epoch": 0.6239192911742609, + "grad_norm": 0.15893806517124176, + "learning_rate": 0.001, + "loss": 2.1439, + "num_input_tokens_seen": 34282942784, + "step": 65400 + }, + { + "epoch": 0.6243962936904492, + "grad_norm": 0.15153321623802185, + "learning_rate": 0.001, + "loss": 2.1423, + "num_input_tokens_seen": 34309151360, + "step": 65450 + }, + { + "epoch": 0.6248732962066375, + "grad_norm": 0.13959857821464539, + "learning_rate": 0.001, + "loss": 2.1426, + "num_input_tokens_seen": 34335361632, + "step": 65500 + }, + { + "epoch": 0.6248732962066375, + "eval_loss": 2.056854724884033, + "eval_runtime": 80.3509, + "eval_samples_per_second": 62.227, + "eval_steps_per_second": 15.557, + "num_input_tokens_seen": 34335361632, + "step": 65500 + }, + { + "epoch": 0.6253502987228258, + "grad_norm": 0.1324777901172638, + "learning_rate": 0.001, + "loss": 2.1397, + "num_input_tokens_seen": 34361575424, + "step": 65550 + }, + { + "epoch": 0.6258273012390141, + "grad_norm": 0.14406649768352509, + "learning_rate": 0.001, + "loss": 2.1469, + "num_input_tokens_seen": 34387789824, + "step": 65600 + }, + { + "epoch": 0.6263043037552023, + "grad_norm": 0.14909878373146057, + "learning_rate": 0.001, + "loss": 2.1485, + "num_input_tokens_seen": 34414002464, + "step": 65650 + }, + { + "epoch": 0.6267813062713906, + "grad_norm": 0.14565107226371765, + "learning_rate": 0.001, + "loss": 2.1459, + "num_input_tokens_seen": 34440213152, + "step": 65700 + }, + { + "epoch": 0.6272583087875788, + "grad_norm": 0.1643950343132019, + "learning_rate": 0.001, + "loss": 2.1311, + "num_input_tokens_seen": 34466427104, + "step": 65750 + }, + { + "epoch": 0.6277353113037671, + "grad_norm": 0.14298604428768158, + "learning_rate": 0.001, + "loss": 2.1452, + "num_input_tokens_seen": 34492641472, + "step": 65800 + }, + { + "epoch": 0.6282123138199553, + "grad_norm": 0.1350494772195816, + "learning_rate": 0.001, + "loss": 2.1349, + "num_input_tokens_seen": 34518851072, + "step": 65850 + }, + { + "epoch": 0.6286893163361437, + "grad_norm": 0.15055470168590546, + "learning_rate": 0.001, + "loss": 2.1446, + "num_input_tokens_seen": 34545063136, + "step": 65900 + }, + { + "epoch": 0.629166318852332, + "grad_norm": 0.15200093388557434, + "learning_rate": 0.001, + "loss": 2.1377, + "num_input_tokens_seen": 34571258560, + "step": 65950 + }, + { + "epoch": 0.6296433213685202, + "grad_norm": 0.1419169306755066, + "learning_rate": 0.001, + "loss": 2.1483, + "num_input_tokens_seen": 34597457472, + "step": 66000 + }, + { + "epoch": 0.6296433213685202, + "eval_loss": 2.0544610023498535, + "eval_runtime": 80.091, + "eval_samples_per_second": 62.429, + "eval_steps_per_second": 15.607, + "num_input_tokens_seen": 34597457472, + "step": 66000 + }, + { + "epoch": 0.6301203238847085, + "grad_norm": 0.14043961465358734, + "learning_rate": 0.001, + "loss": 2.1347, + "num_input_tokens_seen": 34623670848, + "step": 66050 + }, + { + "epoch": 0.6305973264008967, + "grad_norm": 0.14886051416397095, + "learning_rate": 0.001, + "loss": 2.1338, + "num_input_tokens_seen": 34649880224, + "step": 66100 + }, + { + "epoch": 0.631074328917085, + "grad_norm": 0.14698228240013123, + "learning_rate": 0.001, + "loss": 2.1375, + "num_input_tokens_seen": 34676090752, + "step": 66150 + }, + { + "epoch": 0.6315513314332734, + "grad_norm": 0.1423393338918686, + "learning_rate": 0.001, + "loss": 2.1377, + "num_input_tokens_seen": 34702305152, + "step": 66200 + }, + { + "epoch": 0.6320283339494616, + "grad_norm": 0.1530950367450714, + "learning_rate": 0.001, + "loss": 2.1421, + "num_input_tokens_seen": 34728510528, + "step": 66250 + }, + { + "epoch": 0.6325053364656499, + "grad_norm": 0.14289388060569763, + "learning_rate": 0.001, + "loss": 2.1482, + "num_input_tokens_seen": 34754723296, + "step": 66300 + }, + { + "epoch": 0.6329823389818381, + "grad_norm": 0.15754513442516327, + "learning_rate": 0.001, + "loss": 2.1319, + "num_input_tokens_seen": 34780937152, + "step": 66350 + }, + { + "epoch": 0.6334593414980264, + "grad_norm": 0.14707081019878387, + "learning_rate": 0.001, + "loss": 2.1441, + "num_input_tokens_seen": 34807151552, + "step": 66400 + }, + { + "epoch": 0.6339363440142147, + "grad_norm": 0.13461631536483765, + "learning_rate": 0.001, + "loss": 2.1467, + "num_input_tokens_seen": 34833363968, + "step": 66450 + }, + { + "epoch": 0.6344133465304029, + "grad_norm": 0.14467968046665192, + "learning_rate": 0.001, + "loss": 2.1409, + "num_input_tokens_seen": 34859578368, + "step": 66500 + }, + { + "epoch": 0.6344133465304029, + "eval_loss": 2.0547854900360107, + "eval_runtime": 80.2151, + "eval_samples_per_second": 62.332, + "eval_steps_per_second": 15.583, + "num_input_tokens_seen": 34859578368, + "step": 66500 + }, + { + "epoch": 0.6348903490465913, + "grad_norm": 0.14870643615722656, + "learning_rate": 0.001, + "loss": 2.1386, + "num_input_tokens_seen": 34885791584, + "step": 66550 + }, + { + "epoch": 0.6353673515627795, + "grad_norm": 0.15154273808002472, + "learning_rate": 0.001, + "loss": 2.1329, + "num_input_tokens_seen": 34911990016, + "step": 66600 + }, + { + "epoch": 0.6358443540789678, + "grad_norm": 0.14637821912765503, + "learning_rate": 0.001, + "loss": 2.1307, + "num_input_tokens_seen": 34938204416, + "step": 66650 + }, + { + "epoch": 0.636321356595156, + "grad_norm": 0.15013527870178223, + "learning_rate": 0.001, + "loss": 2.1406, + "num_input_tokens_seen": 34964415968, + "step": 66700 + }, + { + "epoch": 0.6367983591113443, + "grad_norm": 0.14377915859222412, + "learning_rate": 0.001, + "loss": 2.1395, + "num_input_tokens_seen": 34990625568, + "step": 66750 + }, + { + "epoch": 0.6372753616275326, + "grad_norm": 0.14643235504627228, + "learning_rate": 0.001, + "loss": 2.1416, + "num_input_tokens_seen": 35016834496, + "step": 66800 + }, + { + "epoch": 0.6377523641437208, + "grad_norm": 0.1544150412082672, + "learning_rate": 0.001, + "loss": 2.1344, + "num_input_tokens_seen": 35043044928, + "step": 66850 + }, + { + "epoch": 0.6382293666599091, + "grad_norm": 0.15571437776088715, + "learning_rate": 0.001, + "loss": 2.1259, + "num_input_tokens_seen": 35069259328, + "step": 66900 + }, + { + "epoch": 0.6387063691760974, + "grad_norm": 0.15925458073616028, + "learning_rate": 0.001, + "loss": 2.1329, + "num_input_tokens_seen": 35095468736, + "step": 66950 + }, + { + "epoch": 0.6391833716922857, + "grad_norm": 0.14532826840877533, + "learning_rate": 0.001, + "loss": 2.1368, + "num_input_tokens_seen": 35121682240, + "step": 67000 + }, + { + "epoch": 0.6391833716922857, + "eval_loss": 2.055466651916504, + "eval_runtime": 80.6411, + "eval_samples_per_second": 62.003, + "eval_steps_per_second": 15.501, + "num_input_tokens_seen": 35121682240, + "step": 67000 + }, + { + "epoch": 0.639660374208474, + "grad_norm": 0.13905447721481323, + "learning_rate": 0.001, + "loss": 2.133, + "num_input_tokens_seen": 35147896096, + "step": 67050 + }, + { + "epoch": 0.6401373767246622, + "grad_norm": 0.14530105888843536, + "learning_rate": 0.001, + "loss": 2.1422, + "num_input_tokens_seen": 35174110496, + "step": 67100 + }, + { + "epoch": 0.6406143792408505, + "grad_norm": 0.13817119598388672, + "learning_rate": 0.001, + "loss": 2.1362, + "num_input_tokens_seen": 35200323040, + "step": 67150 + }, + { + "epoch": 0.6410913817570387, + "grad_norm": 0.14851312339305878, + "learning_rate": 0.001, + "loss": 2.1424, + "num_input_tokens_seen": 35226535808, + "step": 67200 + }, + { + "epoch": 0.641568384273227, + "grad_norm": 0.13070625066757202, + "learning_rate": 0.001, + "loss": 2.1355, + "num_input_tokens_seen": 35252750208, + "step": 67250 + }, + { + "epoch": 0.6420453867894154, + "grad_norm": 0.1390138566493988, + "learning_rate": 0.001, + "loss": 2.1422, + "num_input_tokens_seen": 35278951616, + "step": 67300 + }, + { + "epoch": 0.6425223893056036, + "grad_norm": 0.1507682204246521, + "learning_rate": 0.001, + "loss": 2.1436, + "num_input_tokens_seen": 35305166016, + "step": 67350 + }, + { + "epoch": 0.6429993918217919, + "grad_norm": 0.14612345397472382, + "learning_rate": 0.001, + "loss": 2.1408, + "num_input_tokens_seen": 35331376032, + "step": 67400 + }, + { + "epoch": 0.6434763943379801, + "grad_norm": 0.1487749069929123, + "learning_rate": 0.001, + "loss": 2.1365, + "num_input_tokens_seen": 35357590432, + "step": 67450 + }, + { + "epoch": 0.6439533968541684, + "grad_norm": 0.14507658779621124, + "learning_rate": 0.001, + "loss": 2.1366, + "num_input_tokens_seen": 35383796224, + "step": 67500 + }, + { + "epoch": 0.6439533968541684, + "eval_loss": 2.054283380508423, + "eval_runtime": 80.2964, + "eval_samples_per_second": 62.269, + "eval_steps_per_second": 15.567, + "num_input_tokens_seen": 35383796224, + "step": 67500 + }, + { + "epoch": 0.6444303993703567, + "grad_norm": 0.13577920198440552, + "learning_rate": 0.001, + "loss": 2.1375, + "num_input_tokens_seen": 35410002592, + "step": 67550 + }, + { + "epoch": 0.6449074018865449, + "grad_norm": 0.15499469637870789, + "learning_rate": 0.001, + "loss": 2.1412, + "num_input_tokens_seen": 35436216736, + "step": 67600 + }, + { + "epoch": 0.6453844044027333, + "grad_norm": 0.20250071585178375, + "learning_rate": 0.001, + "loss": 2.147, + "num_input_tokens_seen": 35462413280, + "step": 67650 + }, + { + "epoch": 0.6458614069189215, + "grad_norm": 0.1439153552055359, + "learning_rate": 0.001, + "loss": 2.1466, + "num_input_tokens_seen": 35488624192, + "step": 67700 + }, + { + "epoch": 0.6463384094351098, + "grad_norm": 0.153683140873909, + "learning_rate": 0.001, + "loss": 2.1469, + "num_input_tokens_seen": 35514838592, + "step": 67750 + }, + { + "epoch": 0.646815411951298, + "grad_norm": 0.12951679527759552, + "learning_rate": 0.001, + "loss": 2.1405, + "num_input_tokens_seen": 35541050208, + "step": 67800 + }, + { + "epoch": 0.6472924144674863, + "grad_norm": 0.14119641482830048, + "learning_rate": 0.001, + "loss": 2.1339, + "num_input_tokens_seen": 35567256480, + "step": 67850 + }, + { + "epoch": 0.6477694169836746, + "grad_norm": 0.15403713285923004, + "learning_rate": 0.001, + "loss": 2.1376, + "num_input_tokens_seen": 35593470880, + "step": 67900 + }, + { + "epoch": 0.6482464194998628, + "grad_norm": 0.14498727023601532, + "learning_rate": 0.001, + "loss": 2.1461, + "num_input_tokens_seen": 35619682656, + "step": 67950 + }, + { + "epoch": 0.6487234220160512, + "grad_norm": 0.13579486310482025, + "learning_rate": 0.001, + "loss": 2.137, + "num_input_tokens_seen": 35645894016, + "step": 68000 + }, + { + "epoch": 0.6487234220160512, + "eval_loss": 2.0545461177825928, + "eval_runtime": 79.8852, + "eval_samples_per_second": 62.59, + "eval_steps_per_second": 15.647, + "num_input_tokens_seen": 35645894016, + "step": 68000 + }, + { + "epoch": 0.6492004245322394, + "grad_norm": 0.15173807740211487, + "learning_rate": 0.001, + "loss": 2.1376, + "num_input_tokens_seen": 35672102176, + "step": 68050 + }, + { + "epoch": 0.6496774270484277, + "grad_norm": 0.13916128873825073, + "learning_rate": 0.001, + "loss": 2.1341, + "num_input_tokens_seen": 35698316576, + "step": 68100 + }, + { + "epoch": 0.650154429564616, + "grad_norm": 0.14804038405418396, + "learning_rate": 0.001, + "loss": 2.1359, + "num_input_tokens_seen": 35724521824, + "step": 68150 + }, + { + "epoch": 0.6506314320808042, + "grad_norm": 0.17799383401870728, + "learning_rate": 0.001, + "loss": 2.1309, + "num_input_tokens_seen": 35750733344, + "step": 68200 + }, + { + "epoch": 0.6511084345969925, + "grad_norm": 0.14043092727661133, + "learning_rate": 0.001, + "loss": 2.146, + "num_input_tokens_seen": 35776946304, + "step": 68250 + }, + { + "epoch": 0.6515854371131807, + "grad_norm": 0.13235658407211304, + "learning_rate": 0.001, + "loss": 2.1451, + "num_input_tokens_seen": 35803159264, + "step": 68300 + }, + { + "epoch": 0.652062439629369, + "grad_norm": 0.1564619392156601, + "learning_rate": 0.001, + "loss": 2.1384, + "num_input_tokens_seen": 35829368896, + "step": 68350 + }, + { + "epoch": 0.6525394421455574, + "grad_norm": 0.13722547888755798, + "learning_rate": 0.001, + "loss": 2.1321, + "num_input_tokens_seen": 35855575680, + "step": 68400 + }, + { + "epoch": 0.6530164446617456, + "grad_norm": 0.16468219459056854, + "learning_rate": 0.001, + "loss": 2.1276, + "num_input_tokens_seen": 35881785632, + "step": 68450 + }, + { + "epoch": 0.6534934471779339, + "grad_norm": 0.14734308421611786, + "learning_rate": 0.001, + "loss": 2.1342, + "num_input_tokens_seen": 35907993664, + "step": 68500 + }, + { + "epoch": 0.6534934471779339, + "eval_loss": 2.052057981491089, + "eval_runtime": 79.7834, + "eval_samples_per_second": 62.67, + "eval_steps_per_second": 15.667, + "num_input_tokens_seen": 35907993664, + "step": 68500 + }, + { + "epoch": 0.6539704496941221, + "grad_norm": 0.1417292058467865, + "learning_rate": 0.001, + "loss": 2.1392, + "num_input_tokens_seen": 35934201600, + "step": 68550 + }, + { + "epoch": 0.6544474522103104, + "grad_norm": 0.1365908682346344, + "learning_rate": 0.001, + "loss": 2.1464, + "num_input_tokens_seen": 35960403488, + "step": 68600 + }, + { + "epoch": 0.6549244547264986, + "grad_norm": 0.149469256401062, + "learning_rate": 0.001, + "loss": 2.1472, + "num_input_tokens_seen": 35986617888, + "step": 68650 + }, + { + "epoch": 0.655401457242687, + "grad_norm": 0.13806110620498657, + "learning_rate": 0.001, + "loss": 2.1437, + "num_input_tokens_seen": 36012828320, + "step": 68700 + }, + { + "epoch": 0.6558784597588753, + "grad_norm": 0.15856076776981354, + "learning_rate": 0.001, + "loss": 2.1329, + "num_input_tokens_seen": 36039041440, + "step": 68750 + }, + { + "epoch": 0.6563554622750635, + "grad_norm": 0.14309062063694, + "learning_rate": 0.001, + "loss": 2.139, + "num_input_tokens_seen": 36065255840, + "step": 68800 + }, + { + "epoch": 0.6568324647912518, + "grad_norm": 0.14342211186885834, + "learning_rate": 0.001, + "loss": 2.1481, + "num_input_tokens_seen": 36091469600, + "step": 68850 + }, + { + "epoch": 0.65730946730744, + "grad_norm": 0.1576087325811386, + "learning_rate": 0.001, + "loss": 2.1431, + "num_input_tokens_seen": 36117684000, + "step": 68900 + }, + { + "epoch": 0.6577864698236283, + "grad_norm": 0.13672710955142975, + "learning_rate": 0.001, + "loss": 2.1319, + "num_input_tokens_seen": 36143896192, + "step": 68950 + }, + { + "epoch": 0.6582634723398166, + "grad_norm": 0.1360878199338913, + "learning_rate": 0.001, + "loss": 2.1388, + "num_input_tokens_seen": 36170105088, + "step": 69000 + }, + { + "epoch": 0.6582634723398166, + "eval_loss": 2.050727128982544, + "eval_runtime": 79.5858, + "eval_samples_per_second": 62.825, + "eval_steps_per_second": 15.706, + "num_input_tokens_seen": 36170105088, + "step": 69000 + }, + { + "epoch": 0.6587404748560048, + "grad_norm": 0.13592034578323364, + "learning_rate": 0.001, + "loss": 2.1327, + "num_input_tokens_seen": 36196315744, + "step": 69050 + }, + { + "epoch": 0.6592174773721932, + "grad_norm": 0.14260712265968323, + "learning_rate": 0.001, + "loss": 2.1372, + "num_input_tokens_seen": 36222527072, + "step": 69100 + }, + { + "epoch": 0.6596944798883814, + "grad_norm": 0.15452657639980316, + "learning_rate": 0.001, + "loss": 2.1472, + "num_input_tokens_seen": 36248731872, + "step": 69150 + }, + { + "epoch": 0.6601714824045697, + "grad_norm": 0.1424446702003479, + "learning_rate": 0.001, + "loss": 2.1364, + "num_input_tokens_seen": 36274937600, + "step": 69200 + }, + { + "epoch": 0.660648484920758, + "grad_norm": 0.13716866075992584, + "learning_rate": 0.001, + "loss": 2.1312, + "num_input_tokens_seen": 36301149984, + "step": 69250 + }, + { + "epoch": 0.6611254874369462, + "grad_norm": 0.14809830486774445, + "learning_rate": 0.001, + "loss": 2.1404, + "num_input_tokens_seen": 36327350112, + "step": 69300 + }, + { + "epoch": 0.6616024899531345, + "grad_norm": 0.1415916383266449, + "learning_rate": 0.001, + "loss": 2.135, + "num_input_tokens_seen": 36353559776, + "step": 69350 + }, + { + "epoch": 0.6620794924693227, + "grad_norm": 0.14032308757305145, + "learning_rate": 0.001, + "loss": 2.1428, + "num_input_tokens_seen": 36379772736, + "step": 69400 + }, + { + "epoch": 0.6625564949855111, + "grad_norm": 0.14201749861240387, + "learning_rate": 0.001, + "loss": 2.1325, + "num_input_tokens_seen": 36405987136, + "step": 69450 + }, + { + "epoch": 0.6630334975016994, + "grad_norm": 0.14647674560546875, + "learning_rate": 0.001, + "loss": 2.1339, + "num_input_tokens_seen": 36432197248, + "step": 69500 + }, + { + "epoch": 0.6630334975016994, + "eval_loss": 2.050248384475708, + "eval_runtime": 79.7056, + "eval_samples_per_second": 62.731, + "eval_steps_per_second": 15.683, + "num_input_tokens_seen": 36432197248, + "step": 69500 + }, + { + "epoch": 0.6635105000178876, + "grad_norm": 0.1516619324684143, + "learning_rate": 0.001, + "loss": 2.1328, + "num_input_tokens_seen": 36458411168, + "step": 69550 + }, + { + "epoch": 0.6639875025340759, + "grad_norm": 0.1529875546693802, + "learning_rate": 0.001, + "loss": 2.1337, + "num_input_tokens_seen": 36484615936, + "step": 69600 + }, + { + "epoch": 0.6644645050502641, + "grad_norm": 0.14783112704753876, + "learning_rate": 0.001, + "loss": 2.1247, + "num_input_tokens_seen": 36510827520, + "step": 69650 + }, + { + "epoch": 0.6649415075664524, + "grad_norm": 0.1420241892337799, + "learning_rate": 0.001, + "loss": 2.1273, + "num_input_tokens_seen": 36537041920, + "step": 69700 + }, + { + "epoch": 0.6654185100826406, + "grad_norm": 0.15910027921199799, + "learning_rate": 0.001, + "loss": 2.1318, + "num_input_tokens_seen": 36563256224, + "step": 69750 + }, + { + "epoch": 0.665895512598829, + "grad_norm": 0.14894790947437286, + "learning_rate": 0.001, + "loss": 2.1453, + "num_input_tokens_seen": 36589470624, + "step": 69800 + }, + { + "epoch": 0.6663725151150173, + "grad_norm": 0.14341433346271515, + "learning_rate": 0.001, + "loss": 2.1341, + "num_input_tokens_seen": 36615684320, + "step": 69850 + }, + { + "epoch": 0.6668495176312055, + "grad_norm": 0.1415243148803711, + "learning_rate": 0.001, + "loss": 2.135, + "num_input_tokens_seen": 36641887808, + "step": 69900 + }, + { + "epoch": 0.6673265201473938, + "grad_norm": 0.1512666493654251, + "learning_rate": 0.001, + "loss": 2.128, + "num_input_tokens_seen": 36668096416, + "step": 69950 + }, + { + "epoch": 0.667803522663582, + "grad_norm": 0.1558062583208084, + "learning_rate": 0.001, + "loss": 2.118, + "num_input_tokens_seen": 36694303392, + "step": 70000 + }, + { + "epoch": 0.667803522663582, + "eval_loss": 2.050658941268921, + "eval_runtime": 80.2312, + "eval_samples_per_second": 62.32, + "eval_steps_per_second": 15.58, + "num_input_tokens_seen": 36694303392, + "step": 70000 + }, + { + "epoch": 0.6682805251797703, + "grad_norm": 0.14296776056289673, + "learning_rate": 0.001, + "loss": 2.1323, + "num_input_tokens_seen": 36720513888, + "step": 70050 + }, + { + "epoch": 0.6687575276959586, + "grad_norm": 0.15423347055912018, + "learning_rate": 0.001, + "loss": 2.132, + "num_input_tokens_seen": 36746724864, + "step": 70100 + }, + { + "epoch": 0.6692345302121468, + "grad_norm": 0.155342698097229, + "learning_rate": 0.001, + "loss": 2.1362, + "num_input_tokens_seen": 36772930208, + "step": 70150 + }, + { + "epoch": 0.6697115327283352, + "grad_norm": 0.14429853856563568, + "learning_rate": 0.001, + "loss": 2.1307, + "num_input_tokens_seen": 36799144608, + "step": 70200 + }, + { + "epoch": 0.6701885352445234, + "grad_norm": 0.14069730043411255, + "learning_rate": 0.001, + "loss": 2.1386, + "num_input_tokens_seen": 36825352256, + "step": 70250 + }, + { + "epoch": 0.6706655377607117, + "grad_norm": 0.158811554312706, + "learning_rate": 0.001, + "loss": 2.1258, + "num_input_tokens_seen": 36851566656, + "step": 70300 + }, + { + "epoch": 0.6711425402769, + "grad_norm": 0.13650204241275787, + "learning_rate": 0.001, + "loss": 2.1176, + "num_input_tokens_seen": 36877777984, + "step": 70350 + }, + { + "epoch": 0.6716195427930882, + "grad_norm": 0.1499445140361786, + "learning_rate": 0.001, + "loss": 2.1329, + "num_input_tokens_seen": 36903992384, + "step": 70400 + }, + { + "epoch": 0.6720965453092765, + "grad_norm": 0.162213996052742, + "learning_rate": 0.001, + "loss": 2.1241, + "num_input_tokens_seen": 36930204640, + "step": 70450 + }, + { + "epoch": 0.6725735478254647, + "grad_norm": 0.13957861065864563, + "learning_rate": 0.001, + "loss": 2.134, + "num_input_tokens_seen": 36956418880, + "step": 70500 + }, + { + "epoch": 0.6725735478254647, + "eval_loss": 2.0487587451934814, + "eval_runtime": 79.4988, + "eval_samples_per_second": 62.894, + "eval_steps_per_second": 15.724, + "num_input_tokens_seen": 36956418880, + "step": 70500 + }, + { + "epoch": 0.6730505503416531, + "grad_norm": 0.13786406815052032, + "learning_rate": 0.001, + "loss": 2.1295, + "num_input_tokens_seen": 36982628800, + "step": 70550 + }, + { + "epoch": 0.6735275528578413, + "grad_norm": 0.13988524675369263, + "learning_rate": 0.001, + "loss": 2.1399, + "num_input_tokens_seen": 37008836032, + "step": 70600 + }, + { + "epoch": 0.6740045553740296, + "grad_norm": 0.14156313240528107, + "learning_rate": 0.001, + "loss": 2.1389, + "num_input_tokens_seen": 37035035264, + "step": 70650 + }, + { + "epoch": 0.6744815578902179, + "grad_norm": 0.13705122470855713, + "learning_rate": 0.001, + "loss": 2.1327, + "num_input_tokens_seen": 37061248224, + "step": 70700 + }, + { + "epoch": 0.6749585604064061, + "grad_norm": 0.1541953831911087, + "learning_rate": 0.001, + "loss": 2.1386, + "num_input_tokens_seen": 37087461408, + "step": 70750 + }, + { + "epoch": 0.6754355629225944, + "grad_norm": 0.1509193331003189, + "learning_rate": 0.001, + "loss": 2.1368, + "num_input_tokens_seen": 37113675808, + "step": 70800 + }, + { + "epoch": 0.6759125654387826, + "grad_norm": 0.14552246034145355, + "learning_rate": 0.001, + "loss": 2.1467, + "num_input_tokens_seen": 37139883072, + "step": 70850 + }, + { + "epoch": 0.676389567954971, + "grad_norm": 0.1387251317501068, + "learning_rate": 0.001, + "loss": 2.1255, + "num_input_tokens_seen": 37166094176, + "step": 70900 + }, + { + "epoch": 0.6768665704711593, + "grad_norm": 0.15626934170722961, + "learning_rate": 0.001, + "loss": 2.1268, + "num_input_tokens_seen": 37192306208, + "step": 70950 + }, + { + "epoch": 0.6773435729873475, + "grad_norm": 0.14792028069496155, + "learning_rate": 0.001, + "loss": 2.1461, + "num_input_tokens_seen": 37218518048, + "step": 71000 + }, + { + "epoch": 0.6773435729873475, + "eval_loss": 2.049234390258789, + "eval_runtime": 80.3085, + "eval_samples_per_second": 62.26, + "eval_steps_per_second": 15.565, + "num_input_tokens_seen": 37218518048, + "step": 71000 + }, + { + "epoch": 0.6778205755035358, + "grad_norm": 0.16609162092208862, + "learning_rate": 0.001, + "loss": 2.1336, + "num_input_tokens_seen": 37244730912, + "step": 71050 + }, + { + "epoch": 0.678297578019724, + "grad_norm": 0.15477871894836426, + "learning_rate": 0.001, + "loss": 2.1432, + "num_input_tokens_seen": 37270939744, + "step": 71100 + }, + { + "epoch": 0.6787745805359123, + "grad_norm": 0.1837802231311798, + "learning_rate": 0.001, + "loss": 2.1372, + "num_input_tokens_seen": 37297151424, + "step": 71150 + }, + { + "epoch": 0.6792515830521006, + "grad_norm": 0.14492639899253845, + "learning_rate": 0.001, + "loss": 2.1495, + "num_input_tokens_seen": 37323357760, + "step": 71200 + }, + { + "epoch": 0.6797285855682889, + "grad_norm": 0.14435459673404694, + "learning_rate": 0.001, + "loss": 2.1208, + "num_input_tokens_seen": 37349568992, + "step": 71250 + }, + { + "epoch": 0.6802055880844772, + "grad_norm": 0.1369018405675888, + "learning_rate": 0.001, + "loss": 2.1294, + "num_input_tokens_seen": 37375769344, + "step": 71300 + }, + { + "epoch": 0.6806825906006654, + "grad_norm": 0.15272092819213867, + "learning_rate": 0.001, + "loss": 2.1416, + "num_input_tokens_seen": 37401976768, + "step": 71350 + }, + { + "epoch": 0.6811595931168537, + "grad_norm": 0.13770927488803864, + "learning_rate": 0.001, + "loss": 2.1229, + "num_input_tokens_seen": 37428186944, + "step": 71400 + }, + { + "epoch": 0.681636595633042, + "grad_norm": 0.13732831180095673, + "learning_rate": 0.001, + "loss": 2.1322, + "num_input_tokens_seen": 37454401344, + "step": 71450 + }, + { + "epoch": 0.6821135981492302, + "grad_norm": 0.14253439009189606, + "learning_rate": 0.001, + "loss": 2.1361, + "num_input_tokens_seen": 37480608672, + "step": 71500 + }, + { + "epoch": 0.6821135981492302, + "eval_loss": 2.047884464263916, + "eval_runtime": 80.2076, + "eval_samples_per_second": 62.338, + "eval_steps_per_second": 15.585, + "num_input_tokens_seen": 37480608672, + "step": 71500 + }, + { + "epoch": 0.6825906006654185, + "grad_norm": 0.14250271022319794, + "learning_rate": 0.001, + "loss": 2.1243, + "num_input_tokens_seen": 37506820512, + "step": 71550 + }, + { + "epoch": 0.6830676031816068, + "grad_norm": 0.14131279289722443, + "learning_rate": 0.001, + "loss": 2.1271, + "num_input_tokens_seen": 37533034912, + "step": 71600 + }, + { + "epoch": 0.6835446056977951, + "grad_norm": 0.1426624059677124, + "learning_rate": 0.001, + "loss": 2.1407, + "num_input_tokens_seen": 37559241280, + "step": 71650 + }, + { + "epoch": 0.6840216082139833, + "grad_norm": 0.15065455436706543, + "learning_rate": 0.001, + "loss": 2.1409, + "num_input_tokens_seen": 37585440352, + "step": 71700 + }, + { + "epoch": 0.6844986107301716, + "grad_norm": 0.15656264126300812, + "learning_rate": 0.001, + "loss": 2.1375, + "num_input_tokens_seen": 37611650048, + "step": 71750 + }, + { + "epoch": 0.6849756132463599, + "grad_norm": 0.15184299647808075, + "learning_rate": 0.001, + "loss": 2.1341, + "num_input_tokens_seen": 37637858912, + "step": 71800 + }, + { + "epoch": 0.6854526157625481, + "grad_norm": 0.14735595881938934, + "learning_rate": 0.001, + "loss": 2.1262, + "num_input_tokens_seen": 37664073312, + "step": 71850 + }, + { + "epoch": 0.6859296182787364, + "grad_norm": 0.13618548214435577, + "learning_rate": 0.001, + "loss": 2.1271, + "num_input_tokens_seen": 37690272480, + "step": 71900 + }, + { + "epoch": 0.6864066207949246, + "grad_norm": 0.15221554040908813, + "learning_rate": 0.001, + "loss": 2.1182, + "num_input_tokens_seen": 37716486080, + "step": 71950 + }, + { + "epoch": 0.686883623311113, + "grad_norm": 0.14794082939624786, + "learning_rate": 0.001, + "loss": 2.1369, + "num_input_tokens_seen": 37742700480, + "step": 72000 + }, + { + "epoch": 0.686883623311113, + "eval_loss": 2.0480294227600098, + "eval_runtime": 80.2069, + "eval_samples_per_second": 62.339, + "eval_steps_per_second": 15.585, + "num_input_tokens_seen": 37742700480, + "step": 72000 + }, + { + "epoch": 0.6873606258273013, + "grad_norm": 0.1561603844165802, + "learning_rate": 0.001, + "loss": 2.1356, + "num_input_tokens_seen": 37768907520, + "step": 72050 + }, + { + "epoch": 0.6878376283434895, + "grad_norm": 0.1416538655757904, + "learning_rate": 0.001, + "loss": 2.1286, + "num_input_tokens_seen": 37795120992, + "step": 72100 + }, + { + "epoch": 0.6883146308596778, + "grad_norm": 0.13913485407829285, + "learning_rate": 0.001, + "loss": 2.1363, + "num_input_tokens_seen": 37821335072, + "step": 72150 + }, + { + "epoch": 0.688791633375866, + "grad_norm": 0.14764831960201263, + "learning_rate": 0.001, + "loss": 2.1218, + "num_input_tokens_seen": 37847540704, + "step": 72200 + }, + { + "epoch": 0.6892686358920543, + "grad_norm": 0.1435699462890625, + "learning_rate": 0.001, + "loss": 2.1382, + "num_input_tokens_seen": 37873742272, + "step": 72250 + }, + { + "epoch": 0.6897456384082427, + "grad_norm": 0.13604077696800232, + "learning_rate": 0.001, + "loss": 2.1406, + "num_input_tokens_seen": 37899949728, + "step": 72300 + }, + { + "epoch": 0.6902226409244309, + "grad_norm": 0.1389516144990921, + "learning_rate": 0.001, + "loss": 2.1345, + "num_input_tokens_seen": 37926155104, + "step": 72350 + }, + { + "epoch": 0.6906996434406192, + "grad_norm": 0.15023711323738098, + "learning_rate": 0.001, + "loss": 2.122, + "num_input_tokens_seen": 37952368992, + "step": 72400 + }, + { + "epoch": 0.6911766459568074, + "grad_norm": 0.1581972986459732, + "learning_rate": 0.001, + "loss": 2.1381, + "num_input_tokens_seen": 37978578048, + "step": 72450 + }, + { + "epoch": 0.6916536484729957, + "grad_norm": 0.1558607965707779, + "learning_rate": 0.001, + "loss": 2.1369, + "num_input_tokens_seen": 38004789088, + "step": 72500 + }, + { + "epoch": 0.6916536484729957, + "eval_loss": 2.0476512908935547, + "eval_runtime": 80.2302, + "eval_samples_per_second": 62.321, + "eval_steps_per_second": 15.58, + "num_input_tokens_seen": 38004789088, + "step": 72500 + }, + { + "epoch": 0.6921306509891839, + "grad_norm": 0.15057148039340973, + "learning_rate": 0.001, + "loss": 2.1315, + "num_input_tokens_seen": 38030998464, + "step": 72550 + }, + { + "epoch": 0.6926076535053722, + "grad_norm": 0.1446782648563385, + "learning_rate": 0.001, + "loss": 2.1385, + "num_input_tokens_seen": 38057210368, + "step": 72600 + }, + { + "epoch": 0.6930846560215606, + "grad_norm": 0.15265704691410065, + "learning_rate": 0.001, + "loss": 2.1313, + "num_input_tokens_seen": 38083424608, + "step": 72650 + }, + { + "epoch": 0.6935616585377488, + "grad_norm": 0.15340076386928558, + "learning_rate": 0.001, + "loss": 2.1335, + "num_input_tokens_seen": 38109639008, + "step": 72700 + }, + { + "epoch": 0.6940386610539371, + "grad_norm": 0.15052905678749084, + "learning_rate": 0.001, + "loss": 2.1291, + "num_input_tokens_seen": 38135849184, + "step": 72750 + }, + { + "epoch": 0.6945156635701253, + "grad_norm": 0.1488681584596634, + "learning_rate": 0.001, + "loss": 2.1269, + "num_input_tokens_seen": 38162055520, + "step": 72800 + }, + { + "epoch": 0.6949926660863136, + "grad_norm": 0.14417092502117157, + "learning_rate": 0.001, + "loss": 2.1291, + "num_input_tokens_seen": 38188267136, + "step": 72850 + }, + { + "epoch": 0.6954696686025019, + "grad_norm": 0.14182248711585999, + "learning_rate": 0.001, + "loss": 2.1369, + "num_input_tokens_seen": 38214476512, + "step": 72900 + }, + { + "epoch": 0.6959466711186901, + "grad_norm": 0.13842789828777313, + "learning_rate": 0.001, + "loss": 2.1487, + "num_input_tokens_seen": 38240668640, + "step": 72950 + }, + { + "epoch": 0.6964236736348784, + "grad_norm": 0.13514494895935059, + "learning_rate": 0.001, + "loss": 2.1379, + "num_input_tokens_seen": 38266876896, + "step": 73000 + }, + { + "epoch": 0.6964236736348784, + "eval_loss": 2.0458996295928955, + "eval_runtime": 80.4905, + "eval_samples_per_second": 62.119, + "eval_steps_per_second": 15.53, + "num_input_tokens_seen": 38266876896, + "step": 73000 + }, + { + "epoch": 0.6969006761510667, + "grad_norm": 0.14907589554786682, + "learning_rate": 0.001, + "loss": 2.118, + "num_input_tokens_seen": 38293080608, + "step": 73050 + }, + { + "epoch": 0.697377678667255, + "grad_norm": 0.1431187242269516, + "learning_rate": 0.001, + "loss": 2.1337, + "num_input_tokens_seen": 38319293216, + "step": 73100 + }, + { + "epoch": 0.6978546811834433, + "grad_norm": 0.15081514418125153, + "learning_rate": 0.001, + "loss": 2.1272, + "num_input_tokens_seen": 38345504128, + "step": 73150 + }, + { + "epoch": 0.6983316836996315, + "grad_norm": 0.16645316779613495, + "learning_rate": 0.001, + "loss": 2.1272, + "num_input_tokens_seen": 38371716608, + "step": 73200 + }, + { + "epoch": 0.6988086862158198, + "grad_norm": 0.13646705448627472, + "learning_rate": 0.001, + "loss": 2.1267, + "num_input_tokens_seen": 38397931008, + "step": 73250 + }, + { + "epoch": 0.699285688732008, + "grad_norm": 0.147465780377388, + "learning_rate": 0.001, + "loss": 2.1372, + "num_input_tokens_seen": 38424145312, + "step": 73300 + }, + { + "epoch": 0.6997626912481963, + "grad_norm": 0.15060865879058838, + "learning_rate": 0.001, + "loss": 2.1253, + "num_input_tokens_seen": 38450358240, + "step": 73350 + }, + { + "epoch": 0.7002396937643846, + "grad_norm": 0.14528082311153412, + "learning_rate": 0.001, + "loss": 2.1285, + "num_input_tokens_seen": 38476567968, + "step": 73400 + }, + { + "epoch": 0.7007166962805729, + "grad_norm": 0.15923307836055756, + "learning_rate": 0.001, + "loss": 2.1345, + "num_input_tokens_seen": 38502782336, + "step": 73450 + }, + { + "epoch": 0.7011936987967612, + "grad_norm": 0.1410328447818756, + "learning_rate": 0.001, + "loss": 2.1235, + "num_input_tokens_seen": 38528990144, + "step": 73500 + }, + { + "epoch": 0.7011936987967612, + "eval_loss": 2.045837163925171, + "eval_runtime": 79.7863, + "eval_samples_per_second": 62.667, + "eval_steps_per_second": 15.667, + "num_input_tokens_seen": 38528990144, + "step": 73500 + }, + { + "epoch": 0.7016707013129494, + "grad_norm": 0.13489565253257751, + "learning_rate": 0.001, + "loss": 2.1331, + "num_input_tokens_seen": 38555187872, + "step": 73550 + }, + { + "epoch": 0.7021477038291377, + "grad_norm": 0.15437881648540497, + "learning_rate": 0.001, + "loss": 2.1341, + "num_input_tokens_seen": 38581402272, + "step": 73600 + }, + { + "epoch": 0.7026247063453259, + "grad_norm": 0.1449405401945114, + "learning_rate": 0.001, + "loss": 2.1244, + "num_input_tokens_seen": 38607616672, + "step": 73650 + }, + { + "epoch": 0.7031017088615142, + "grad_norm": 0.1428922414779663, + "learning_rate": 0.001, + "loss": 2.1309, + "num_input_tokens_seen": 38633831072, + "step": 73700 + }, + { + "epoch": 0.7035787113777026, + "grad_norm": 0.14642202854156494, + "learning_rate": 0.001, + "loss": 2.1343, + "num_input_tokens_seen": 38660044448, + "step": 73750 + }, + { + "epoch": 0.7040557138938908, + "grad_norm": 0.138772115111351, + "learning_rate": 0.001, + "loss": 2.1299, + "num_input_tokens_seen": 38686253024, + "step": 73800 + }, + { + "epoch": 0.7045327164100791, + "grad_norm": 0.14213278889656067, + "learning_rate": 0.001, + "loss": 2.1238, + "num_input_tokens_seen": 38712461152, + "step": 73850 + }, + { + "epoch": 0.7050097189262673, + "grad_norm": 0.13823473453521729, + "learning_rate": 0.001, + "loss": 2.1197, + "num_input_tokens_seen": 38738675552, + "step": 73900 + }, + { + "epoch": 0.7054867214424556, + "grad_norm": 0.14536434412002563, + "learning_rate": 0.001, + "loss": 2.1276, + "num_input_tokens_seen": 38764889952, + "step": 73950 + }, + { + "epoch": 0.7059637239586439, + "grad_norm": 0.1466161459684372, + "learning_rate": 0.001, + "loss": 2.127, + "num_input_tokens_seen": 38791103008, + "step": 74000 + }, + { + "epoch": 0.7059637239586439, + "eval_loss": 2.045396566390991, + "eval_runtime": 79.9741, + "eval_samples_per_second": 62.52, + "eval_steps_per_second": 15.63, + "num_input_tokens_seen": 38791103008, + "step": 74000 + }, + { + "epoch": 0.7064407264748321, + "grad_norm": 0.13936443626880646, + "learning_rate": 0.001, + "loss": 2.1354, + "num_input_tokens_seen": 38817315392, + "step": 74050 + }, + { + "epoch": 0.7069177289910205, + "grad_norm": 0.14214852452278137, + "learning_rate": 0.001, + "loss": 2.1254, + "num_input_tokens_seen": 38843526016, + "step": 74100 + }, + { + "epoch": 0.7073947315072087, + "grad_norm": 0.133310005068779, + "learning_rate": 0.001, + "loss": 2.1199, + "num_input_tokens_seen": 38869733760, + "step": 74150 + }, + { + "epoch": 0.707871734023397, + "grad_norm": 0.1473781317472458, + "learning_rate": 0.001, + "loss": 2.1187, + "num_input_tokens_seen": 38895948160, + "step": 74200 + }, + { + "epoch": 0.7083487365395853, + "grad_norm": 0.15462498366832733, + "learning_rate": 0.001, + "loss": 2.1318, + "num_input_tokens_seen": 38922156384, + "step": 74250 + }, + { + "epoch": 0.7088257390557735, + "grad_norm": 0.13819323480129242, + "learning_rate": 0.001, + "loss": 2.1345, + "num_input_tokens_seen": 38948364896, + "step": 74300 + }, + { + "epoch": 0.7093027415719618, + "grad_norm": 0.14366789162158966, + "learning_rate": 0.001, + "loss": 2.1295, + "num_input_tokens_seen": 38974568576, + "step": 74350 + }, + { + "epoch": 0.70977974408815, + "grad_norm": 0.1546156257390976, + "learning_rate": 0.001, + "loss": 2.1398, + "num_input_tokens_seen": 39000771968, + "step": 74400 + }, + { + "epoch": 0.7102567466043384, + "grad_norm": 0.14302626252174377, + "learning_rate": 0.001, + "loss": 2.1276, + "num_input_tokens_seen": 39026975968, + "step": 74450 + }, + { + "epoch": 0.7107337491205266, + "grad_norm": 0.14276665449142456, + "learning_rate": 0.001, + "loss": 2.1362, + "num_input_tokens_seen": 39053184384, + "step": 74500 + }, + { + "epoch": 0.7107337491205266, + "eval_loss": 2.0453200340270996, + "eval_runtime": 80.1033, + "eval_samples_per_second": 62.419, + "eval_steps_per_second": 15.605, + "num_input_tokens_seen": 39053184384, + "step": 74500 + }, + { + "epoch": 0.7112107516367149, + "grad_norm": 0.15070320665836334, + "learning_rate": 0.001, + "loss": 2.1197, + "num_input_tokens_seen": 39079385760, + "step": 74550 + }, + { + "epoch": 0.7116877541529032, + "grad_norm": 0.14792390167713165, + "learning_rate": 0.001, + "loss": 2.1332, + "num_input_tokens_seen": 39105597568, + "step": 74600 + }, + { + "epoch": 0.7121647566690914, + "grad_norm": 0.15546678006649017, + "learning_rate": 0.001, + "loss": 2.1315, + "num_input_tokens_seen": 39131798528, + "step": 74650 + }, + { + "epoch": 0.7126417591852797, + "grad_norm": 0.15446694195270538, + "learning_rate": 0.001, + "loss": 2.1346, + "num_input_tokens_seen": 39158006240, + "step": 74700 + }, + { + "epoch": 0.7131187617014679, + "grad_norm": 0.14489658176898956, + "learning_rate": 0.001, + "loss": 2.1217, + "num_input_tokens_seen": 39184214656, + "step": 74750 + }, + { + "epoch": 0.7135957642176562, + "grad_norm": 0.14391835033893585, + "learning_rate": 0.001, + "loss": 2.1318, + "num_input_tokens_seen": 39210425120, + "step": 74800 + }, + { + "epoch": 0.7140727667338446, + "grad_norm": 0.1562168151140213, + "learning_rate": 0.001, + "loss": 2.1367, + "num_input_tokens_seen": 39236633824, + "step": 74850 + }, + { + "epoch": 0.7145497692500328, + "grad_norm": 0.14505062997341156, + "learning_rate": 0.001, + "loss": 2.126, + "num_input_tokens_seen": 39262845056, + "step": 74900 + }, + { + "epoch": 0.7150267717662211, + "grad_norm": 0.17240794003009796, + "learning_rate": 0.001, + "loss": 2.1316, + "num_input_tokens_seen": 39289059456, + "step": 74950 + }, + { + "epoch": 0.7155037742824093, + "grad_norm": 0.14480435848236084, + "learning_rate": 0.001, + "loss": 2.1096, + "num_input_tokens_seen": 39315259072, + "step": 75000 + }, + { + "epoch": 0.7155037742824093, + "eval_loss": 2.0447704792022705, + "eval_runtime": 80.0622, + "eval_samples_per_second": 62.451, + "eval_steps_per_second": 15.613, + "num_input_tokens_seen": 39315259072, + "step": 75000 + }, + { + "epoch": 0.7159807767985976, + "grad_norm": 0.15248794853687286, + "learning_rate": 0.001, + "loss": 2.1311, + "num_input_tokens_seen": 39341461440, + "step": 75050 + }, + { + "epoch": 0.7164577793147859, + "grad_norm": 0.13991257548332214, + "learning_rate": 0.001, + "loss": 2.142, + "num_input_tokens_seen": 39367670304, + "step": 75100 + }, + { + "epoch": 0.7169347818309741, + "grad_norm": 0.14684896171092987, + "learning_rate": 0.001, + "loss": 2.131, + "num_input_tokens_seen": 39393880928, + "step": 75150 + }, + { + "epoch": 0.7174117843471625, + "grad_norm": 0.14778843522071838, + "learning_rate": 0.001, + "loss": 2.1185, + "num_input_tokens_seen": 39420094080, + "step": 75200 + }, + { + "epoch": 0.7178887868633507, + "grad_norm": 0.14234404265880585, + "learning_rate": 0.001, + "loss": 2.1172, + "num_input_tokens_seen": 39446305376, + "step": 75250 + }, + { + "epoch": 0.718365789379539, + "grad_norm": 0.1400527060031891, + "learning_rate": 0.001, + "loss": 2.1333, + "num_input_tokens_seen": 39472510304, + "step": 75300 + }, + { + "epoch": 0.7188427918957272, + "grad_norm": 0.14747172594070435, + "learning_rate": 0.001, + "loss": 2.1369, + "num_input_tokens_seen": 39498722784, + "step": 75350 + }, + { + "epoch": 0.7193197944119155, + "grad_norm": 0.14818298816680908, + "learning_rate": 0.001, + "loss": 2.1375, + "num_input_tokens_seen": 39524932416, + "step": 75400 + }, + { + "epoch": 0.7197967969281038, + "grad_norm": 0.14356687664985657, + "learning_rate": 0.001, + "loss": 2.1334, + "num_input_tokens_seen": 39551136768, + "step": 75450 + }, + { + "epoch": 0.720273799444292, + "grad_norm": 0.12981760501861572, + "learning_rate": 0.001, + "loss": 2.1273, + "num_input_tokens_seen": 39577349376, + "step": 75500 + }, + { + "epoch": 0.720273799444292, + "eval_loss": 2.0432538986206055, + "eval_runtime": 80.0711, + "eval_samples_per_second": 62.445, + "eval_steps_per_second": 15.611, + "num_input_tokens_seen": 39577349376, + "step": 75500 + }, + { + "epoch": 0.7207508019604804, + "grad_norm": 0.14318658411502838, + "learning_rate": 0.001, + "loss": 2.1299, + "num_input_tokens_seen": 39603560448, + "step": 75550 + }, + { + "epoch": 0.7212278044766686, + "grad_norm": 0.1411541849374771, + "learning_rate": 0.001, + "loss": 2.1298, + "num_input_tokens_seen": 39629764544, + "step": 75600 + }, + { + "epoch": 0.7217048069928569, + "grad_norm": 0.15290595591068268, + "learning_rate": 0.001, + "loss": 2.1194, + "num_input_tokens_seen": 39655978944, + "step": 75650 + }, + { + "epoch": 0.7221818095090452, + "grad_norm": 0.15146586298942566, + "learning_rate": 0.001, + "loss": 2.1226, + "num_input_tokens_seen": 39682192608, + "step": 75700 + }, + { + "epoch": 0.7226588120252334, + "grad_norm": 0.1520843505859375, + "learning_rate": 0.001, + "loss": 2.1375, + "num_input_tokens_seen": 39708395360, + "step": 75750 + }, + { + "epoch": 0.7231358145414217, + "grad_norm": 0.14785976707935333, + "learning_rate": 0.001, + "loss": 2.1279, + "num_input_tokens_seen": 39734607232, + "step": 75800 + }, + { + "epoch": 0.7236128170576099, + "grad_norm": 0.1379549652338028, + "learning_rate": 0.001, + "loss": 2.1234, + "num_input_tokens_seen": 39760818368, + "step": 75850 + }, + { + "epoch": 0.7240898195737983, + "grad_norm": 0.15331624448299408, + "learning_rate": 0.001, + "loss": 2.1288, + "num_input_tokens_seen": 39787032768, + "step": 75900 + }, + { + "epoch": 0.7245668220899866, + "grad_norm": 0.14259029924869537, + "learning_rate": 0.001, + "loss": 2.123, + "num_input_tokens_seen": 39813244832, + "step": 75950 + }, + { + "epoch": 0.7250438246061748, + "grad_norm": 0.14519137144088745, + "learning_rate": 0.001, + "loss": 2.1325, + "num_input_tokens_seen": 39839459232, + "step": 76000 + }, + { + "epoch": 0.7250438246061748, + "eval_loss": 2.043282985687256, + "eval_runtime": 80.639, + "eval_samples_per_second": 62.005, + "eval_steps_per_second": 15.501, + "num_input_tokens_seen": 39839459232, + "step": 76000 + }, + { + "epoch": 0.7255208271223631, + "grad_norm": 0.14390450716018677, + "learning_rate": 0.001, + "loss": 2.1264, + "num_input_tokens_seen": 39865673408, + "step": 76050 + }, + { + "epoch": 0.7259978296385513, + "grad_norm": 0.14975307881832123, + "learning_rate": 0.001, + "loss": 2.1256, + "num_input_tokens_seen": 39891887808, + "step": 76100 + }, + { + "epoch": 0.7264748321547396, + "grad_norm": 0.1487993597984314, + "learning_rate": 0.001, + "loss": 2.1181, + "num_input_tokens_seen": 39918100800, + "step": 76150 + }, + { + "epoch": 0.7269518346709279, + "grad_norm": 0.13411827385425568, + "learning_rate": 0.001, + "loss": 2.1253, + "num_input_tokens_seen": 39944315200, + "step": 76200 + }, + { + "epoch": 0.7274288371871162, + "grad_norm": 0.14648018777370453, + "learning_rate": 0.001, + "loss": 2.1212, + "num_input_tokens_seen": 39970507840, + "step": 76250 + }, + { + "epoch": 0.7279058397033045, + "grad_norm": 0.14438115060329437, + "learning_rate": 0.001, + "loss": 2.124, + "num_input_tokens_seen": 39996717120, + "step": 76300 + }, + { + "epoch": 0.7283828422194927, + "grad_norm": 0.1373198926448822, + "learning_rate": 0.001, + "loss": 2.1333, + "num_input_tokens_seen": 40022924960, + "step": 76350 + }, + { + "epoch": 0.728859844735681, + "grad_norm": 0.1409999579191208, + "learning_rate": 0.001, + "loss": 2.1269, + "num_input_tokens_seen": 40049132160, + "step": 76400 + }, + { + "epoch": 0.7293368472518692, + "grad_norm": 0.15943694114685059, + "learning_rate": 0.001, + "loss": 2.1236, + "num_input_tokens_seen": 40075334464, + "step": 76450 + }, + { + "epoch": 0.7298138497680575, + "grad_norm": 0.14787088334560394, + "learning_rate": 0.001, + "loss": 2.1261, + "num_input_tokens_seen": 40101539296, + "step": 76500 + }, + { + "epoch": 0.7298138497680575, + "eval_loss": 2.042207717895508, + "eval_runtime": 80.2748, + "eval_samples_per_second": 62.286, + "eval_steps_per_second": 15.572, + "num_input_tokens_seen": 40101539296, + "step": 76500 + }, + { + "epoch": 0.7302908522842458, + "grad_norm": 0.15151502192020416, + "learning_rate": 0.001, + "loss": 2.1314, + "num_input_tokens_seen": 40127751296, + "step": 76550 + }, + { + "epoch": 0.730767854800434, + "grad_norm": 0.13959145545959473, + "learning_rate": 0.001, + "loss": 2.1249, + "num_input_tokens_seen": 40153964320, + "step": 76600 + }, + { + "epoch": 0.7312448573166224, + "grad_norm": 0.13703158497810364, + "learning_rate": 0.001, + "loss": 2.1347, + "num_input_tokens_seen": 40180178176, + "step": 76650 + }, + { + "epoch": 0.7317218598328106, + "grad_norm": 0.1381351500749588, + "learning_rate": 0.001, + "loss": 2.1268, + "num_input_tokens_seen": 40206390688, + "step": 76700 + }, + { + "epoch": 0.7321988623489989, + "grad_norm": 0.14056669175624847, + "learning_rate": 0.001, + "loss": 2.1174, + "num_input_tokens_seen": 40232601472, + "step": 76750 + }, + { + "epoch": 0.7326758648651872, + "grad_norm": 0.1344117820262909, + "learning_rate": 0.001, + "loss": 2.1319, + "num_input_tokens_seen": 40258813184, + "step": 76800 + }, + { + "epoch": 0.7331528673813754, + "grad_norm": 0.14882655441761017, + "learning_rate": 0.001, + "loss": 2.1361, + "num_input_tokens_seen": 40285020800, + "step": 76850 + }, + { + "epoch": 0.7336298698975637, + "grad_norm": 0.1313314437866211, + "learning_rate": 0.001, + "loss": 2.1244, + "num_input_tokens_seen": 40311227808, + "step": 76900 + }, + { + "epoch": 0.7341068724137519, + "grad_norm": 0.13959497213363647, + "learning_rate": 0.001, + "loss": 2.1233, + "num_input_tokens_seen": 40337436928, + "step": 76950 + }, + { + "epoch": 0.7345838749299403, + "grad_norm": 0.16767309606075287, + "learning_rate": 0.001, + "loss": 2.1284, + "num_input_tokens_seen": 40363651328, + "step": 77000 + }, + { + "epoch": 0.7345838749299403, + "eval_loss": 2.041841983795166, + "eval_runtime": 80.626, + "eval_samples_per_second": 62.015, + "eval_steps_per_second": 15.504, + "num_input_tokens_seen": 40363651328, + "step": 77000 + }, + { + "epoch": 0.7350608774461286, + "grad_norm": 0.14556396007537842, + "learning_rate": 0.001, + "loss": 2.1383, + "num_input_tokens_seen": 40389861376, + "step": 77050 + }, + { + "epoch": 0.7355378799623168, + "grad_norm": 0.16213080286979675, + "learning_rate": 0.001, + "loss": 2.1288, + "num_input_tokens_seen": 40416069344, + "step": 77100 + }, + { + "epoch": 0.7360148824785051, + "grad_norm": 0.15535910427570343, + "learning_rate": 0.001, + "loss": 2.1311, + "num_input_tokens_seen": 40442276800, + "step": 77150 + }, + { + "epoch": 0.7364918849946933, + "grad_norm": 0.14690810441970825, + "learning_rate": 0.001, + "loss": 2.1303, + "num_input_tokens_seen": 40468487520, + "step": 77200 + }, + { + "epoch": 0.7369688875108816, + "grad_norm": 0.1359778791666031, + "learning_rate": 0.001, + "loss": 2.1293, + "num_input_tokens_seen": 40494701920, + "step": 77250 + }, + { + "epoch": 0.7374458900270698, + "grad_norm": 0.1551726907491684, + "learning_rate": 0.001, + "loss": 2.1319, + "num_input_tokens_seen": 40520907008, + "step": 77300 + }, + { + "epoch": 0.7379228925432582, + "grad_norm": 0.1439419388771057, + "learning_rate": 0.001, + "loss": 2.1217, + "num_input_tokens_seen": 40547111296, + "step": 77350 + }, + { + "epoch": 0.7383998950594465, + "grad_norm": 0.15661780536174774, + "learning_rate": 0.001, + "loss": 2.1218, + "num_input_tokens_seen": 40573317376, + "step": 77400 + }, + { + "epoch": 0.7388768975756347, + "grad_norm": 0.15002021193504333, + "learning_rate": 0.001, + "loss": 2.127, + "num_input_tokens_seen": 40599530048, + "step": 77450 + }, + { + "epoch": 0.739353900091823, + "grad_norm": 0.148692324757576, + "learning_rate": 0.001, + "loss": 2.1152, + "num_input_tokens_seen": 40625741536, + "step": 77500 + }, + { + "epoch": 0.739353900091823, + "eval_loss": 2.0417163372039795, + "eval_runtime": 80.5896, + "eval_samples_per_second": 62.043, + "eval_steps_per_second": 15.511, + "num_input_tokens_seen": 40625741536, + "step": 77500 + }, + { + "epoch": 0.7398309026080112, + "grad_norm": 0.15267367660999298, + "learning_rate": 0.001, + "loss": 2.1358, + "num_input_tokens_seen": 40651948224, + "step": 77550 + }, + { + "epoch": 0.7403079051241995, + "grad_norm": 0.14535802602767944, + "learning_rate": 0.001, + "loss": 2.1194, + "num_input_tokens_seen": 40678158304, + "step": 77600 + }, + { + "epoch": 0.7407849076403878, + "grad_norm": 0.14069873094558716, + "learning_rate": 0.001, + "loss": 2.1254, + "num_input_tokens_seen": 40704364160, + "step": 77650 + }, + { + "epoch": 0.741261910156576, + "grad_norm": 0.15348641574382782, + "learning_rate": 0.001, + "loss": 2.1227, + "num_input_tokens_seen": 40730574080, + "step": 77700 + }, + { + "epoch": 0.7417389126727644, + "grad_norm": 0.13605211675167084, + "learning_rate": 0.001, + "loss": 2.1319, + "num_input_tokens_seen": 40756787808, + "step": 77750 + }, + { + "epoch": 0.7422159151889526, + "grad_norm": 0.14588411152362823, + "learning_rate": 0.001, + "loss": 2.17, + "num_input_tokens_seen": 40782995104, + "step": 77800 + }, + { + "epoch": 0.7426929177051409, + "grad_norm": 0.14045777916908264, + "learning_rate": 0.001, + "loss": 2.1396, + "num_input_tokens_seen": 40809206464, + "step": 77850 + }, + { + "epoch": 0.7431699202213292, + "grad_norm": 0.1325819045305252, + "learning_rate": 0.001, + "loss": 2.1301, + "num_input_tokens_seen": 40835418400, + "step": 77900 + }, + { + "epoch": 0.7436469227375174, + "grad_norm": 0.14319738745689392, + "learning_rate": 0.001, + "loss": 2.1249, + "num_input_tokens_seen": 40861629600, + "step": 77950 + }, + { + "epoch": 0.7441239252537057, + "grad_norm": 0.12736602127552032, + "learning_rate": 0.001, + "loss": 2.1353, + "num_input_tokens_seen": 40887844000, + "step": 78000 + }, + { + "epoch": 0.7441239252537057, + "eval_loss": 2.0413477420806885, + "eval_runtime": 80.6499, + "eval_samples_per_second": 61.996, + "eval_steps_per_second": 15.499, + "num_input_tokens_seen": 40887844000, + "step": 78000 + }, + { + "epoch": 0.744600927769894, + "grad_norm": 0.14694809913635254, + "learning_rate": 0.001, + "loss": 2.1279, + "num_input_tokens_seen": 40914051584, + "step": 78050 + }, + { + "epoch": 0.7450779302860823, + "grad_norm": 0.13846631348133087, + "learning_rate": 0.001, + "loss": 2.1247, + "num_input_tokens_seen": 40940260192, + "step": 78100 + }, + { + "epoch": 0.7455549328022706, + "grad_norm": 0.1346752941608429, + "learning_rate": 0.001, + "loss": 2.1205, + "num_input_tokens_seen": 40966469856, + "step": 78150 + }, + { + "epoch": 0.7460319353184588, + "grad_norm": 0.14688965678215027, + "learning_rate": 0.001, + "loss": 2.1268, + "num_input_tokens_seen": 40992684256, + "step": 78200 + }, + { + "epoch": 0.7465089378346471, + "grad_norm": 0.13972339034080505, + "learning_rate": 0.001, + "loss": 2.117, + "num_input_tokens_seen": 41018896352, + "step": 78250 + }, + { + "epoch": 0.7469859403508353, + "grad_norm": 0.13046054542064667, + "learning_rate": 0.001, + "loss": 2.1325, + "num_input_tokens_seen": 41045100768, + "step": 78300 + }, + { + "epoch": 0.7474629428670236, + "grad_norm": 0.14544983208179474, + "learning_rate": 0.001, + "loss": 2.1199, + "num_input_tokens_seen": 41071312032, + "step": 78350 + }, + { + "epoch": 0.7479399453832118, + "grad_norm": 0.13829651474952698, + "learning_rate": 0.001, + "loss": 2.1233, + "num_input_tokens_seen": 41097519104, + "step": 78400 + }, + { + "epoch": 0.7484169478994002, + "grad_norm": 0.13015754520893097, + "learning_rate": 0.001, + "loss": 2.1273, + "num_input_tokens_seen": 41123726848, + "step": 78450 + }, + { + "epoch": 0.7488939504155885, + "grad_norm": 0.15603971481323242, + "learning_rate": 0.001, + "loss": 2.1214, + "num_input_tokens_seen": 41149941248, + "step": 78500 + }, + { + "epoch": 0.7488939504155885, + "eval_loss": 2.0389957427978516, + "eval_runtime": 80.3381, + "eval_samples_per_second": 62.237, + "eval_steps_per_second": 15.559, + "num_input_tokens_seen": 41149941248, + "step": 78500 + }, + { + "epoch": 0.7493709529317767, + "grad_norm": 0.142806738615036, + "learning_rate": 0.001, + "loss": 2.1262, + "num_input_tokens_seen": 41176154656, + "step": 78550 + }, + { + "epoch": 0.749847955447965, + "grad_norm": 0.12840932607650757, + "learning_rate": 0.001, + "loss": 2.1264, + "num_input_tokens_seen": 41202368992, + "step": 78600 + }, + { + "epoch": 0.7503249579641532, + "grad_norm": 0.14436882734298706, + "learning_rate": 0.001, + "loss": 2.1164, + "num_input_tokens_seen": 41228583392, + "step": 78650 + }, + { + "epoch": 0.7508019604803415, + "grad_norm": 0.14413060247898102, + "learning_rate": 0.001, + "loss": 2.1365, + "num_input_tokens_seen": 41254796832, + "step": 78700 + }, + { + "epoch": 0.7512789629965299, + "grad_norm": 0.14348316192626953, + "learning_rate": 0.001, + "loss": 2.1297, + "num_input_tokens_seen": 41281008000, + "step": 78750 + }, + { + "epoch": 0.7517559655127181, + "grad_norm": 0.1614920049905777, + "learning_rate": 0.001, + "loss": 2.1274, + "num_input_tokens_seen": 41307217120, + "step": 78800 + }, + { + "epoch": 0.7522329680289064, + "grad_norm": 0.13642476499080658, + "learning_rate": 0.001, + "loss": 2.1198, + "num_input_tokens_seen": 41333429376, + "step": 78850 + }, + { + "epoch": 0.7527099705450946, + "grad_norm": 0.13858525454998016, + "learning_rate": 0.001, + "loss": 2.1142, + "num_input_tokens_seen": 41359629152, + "step": 78900 + }, + { + "epoch": 0.7531869730612829, + "grad_norm": 0.1430158019065857, + "learning_rate": 0.001, + "loss": 2.1247, + "num_input_tokens_seen": 41385843552, + "step": 78950 + }, + { + "epoch": 0.7536639755774712, + "grad_norm": 0.14330270886421204, + "learning_rate": 0.001, + "loss": 2.1184, + "num_input_tokens_seen": 41412054816, + "step": 79000 + }, + { + "epoch": 0.7536639755774712, + "eval_loss": 2.039459705352783, + "eval_runtime": 79.6308, + "eval_samples_per_second": 62.79, + "eval_steps_per_second": 15.697, + "num_input_tokens_seen": 41412054816, + "step": 79000 + }, + { + "epoch": 0.7541409780936594, + "grad_norm": 0.15824219584465027, + "learning_rate": 0.001, + "loss": 2.1102, + "num_input_tokens_seen": 41438265536, + "step": 79050 + }, + { + "epoch": 0.7546179806098477, + "grad_norm": 0.14686284959316254, + "learning_rate": 0.001, + "loss": 2.1192, + "num_input_tokens_seen": 41464479360, + "step": 79100 + }, + { + "epoch": 0.755094983126036, + "grad_norm": 0.15335090458393097, + "learning_rate": 0.001, + "loss": 2.127, + "num_input_tokens_seen": 41490678464, + "step": 79150 + }, + { + "epoch": 0.7555719856422243, + "grad_norm": 0.15127016603946686, + "learning_rate": 0.001, + "loss": 2.1114, + "num_input_tokens_seen": 41516875424, + "step": 79200 + }, + { + "epoch": 0.7560489881584125, + "grad_norm": 0.14542415738105774, + "learning_rate": 0.001, + "loss": 2.1244, + "num_input_tokens_seen": 41543082560, + "step": 79250 + }, + { + "epoch": 0.7565259906746008, + "grad_norm": 0.1413310468196869, + "learning_rate": 0.001, + "loss": 2.1196, + "num_input_tokens_seen": 41569295104, + "step": 79300 + }, + { + "epoch": 0.7570029931907891, + "grad_norm": 0.15854205191135406, + "learning_rate": 0.001, + "loss": 2.1185, + "num_input_tokens_seen": 41595508128, + "step": 79350 + }, + { + "epoch": 0.7574799957069773, + "grad_norm": 0.14724323153495789, + "learning_rate": 0.001, + "loss": 2.1244, + "num_input_tokens_seen": 41621721792, + "step": 79400 + }, + { + "epoch": 0.7579569982231656, + "grad_norm": 0.139704167842865, + "learning_rate": 0.001, + "loss": 2.1359, + "num_input_tokens_seen": 41647928864, + "step": 79450 + }, + { + "epoch": 0.7584340007393539, + "grad_norm": 0.14574068784713745, + "learning_rate": 0.001, + "loss": 2.1199, + "num_input_tokens_seen": 41674139648, + "step": 79500 + }, + { + "epoch": 0.7584340007393539, + "eval_loss": 2.0389485359191895, + "eval_runtime": 80.0866, + "eval_samples_per_second": 62.432, + "eval_steps_per_second": 15.608, + "num_input_tokens_seen": 41674139648, + "step": 79500 + }, + { + "epoch": 0.7589110032555422, + "grad_norm": 0.14367084205150604, + "learning_rate": 0.001, + "loss": 2.123, + "num_input_tokens_seen": 41700327904, + "step": 79550 + }, + { + "epoch": 0.7593880057717305, + "grad_norm": 0.1638575941324234, + "learning_rate": 0.001, + "loss": 2.1228, + "num_input_tokens_seen": 41726541536, + "step": 79600 + }, + { + "epoch": 0.7598650082879187, + "grad_norm": 0.14226502180099487, + "learning_rate": 0.001, + "loss": 2.1423, + "num_input_tokens_seen": 41752751392, + "step": 79650 + }, + { + "epoch": 0.760342010804107, + "grad_norm": 0.13502418994903564, + "learning_rate": 0.001, + "loss": 2.12, + "num_input_tokens_seen": 41778958816, + "step": 79700 + }, + { + "epoch": 0.7608190133202952, + "grad_norm": 0.1341133862733841, + "learning_rate": 0.001, + "loss": 2.117, + "num_input_tokens_seen": 41805163168, + "step": 79750 + }, + { + "epoch": 0.7612960158364835, + "grad_norm": 0.14015237987041473, + "learning_rate": 0.001, + "loss": 2.1248, + "num_input_tokens_seen": 41831374080, + "step": 79800 + }, + { + "epoch": 0.7617730183526719, + "grad_norm": 0.14166907966136932, + "learning_rate": 0.001, + "loss": 2.1307, + "num_input_tokens_seen": 41857587424, + "step": 79850 + }, + { + "epoch": 0.7622500208688601, + "grad_norm": 0.21544745564460754, + "learning_rate": 0.001, + "loss": 2.1442, + "num_input_tokens_seen": 41883794976, + "step": 79900 + }, + { + "epoch": 0.7627270233850484, + "grad_norm": 0.13640902936458588, + "learning_rate": 0.001, + "loss": 2.157, + "num_input_tokens_seen": 41910007040, + "step": 79950 + }, + { + "epoch": 0.7632040259012366, + "grad_norm": 0.13672830164432526, + "learning_rate": 0.001, + "loss": 2.129, + "num_input_tokens_seen": 41936220160, + "step": 80000 + }, + { + "epoch": 0.7632040259012366, + "eval_loss": 2.04022479057312, + "eval_runtime": 79.3648, + "eval_samples_per_second": 63.0, + "eval_steps_per_second": 15.75, + "num_input_tokens_seen": 41936220160, + "step": 80000 + }, + { + "epoch": 0.7636810284174249, + "grad_norm": 0.1411873996257782, + "learning_rate": 0.001, + "loss": 2.1271, + "num_input_tokens_seen": 41962432768, + "step": 80050 + }, + { + "epoch": 0.7641580309336131, + "grad_norm": 0.14791899919509888, + "learning_rate": 0.001, + "loss": 2.1298, + "num_input_tokens_seen": 41988646144, + "step": 80100 + }, + { + "epoch": 0.7646350334498014, + "grad_norm": 0.1465454399585724, + "learning_rate": 0.001, + "loss": 2.1185, + "num_input_tokens_seen": 42014860544, + "step": 80150 + }, + { + "epoch": 0.7651120359659898, + "grad_norm": 0.1528947502374649, + "learning_rate": 0.001, + "loss": 2.115, + "num_input_tokens_seen": 42041067744, + "step": 80200 + }, + { + "epoch": 0.765589038482178, + "grad_norm": 0.21168603003025055, + "learning_rate": 0.001, + "loss": 2.1304, + "num_input_tokens_seen": 42067282144, + "step": 80250 + }, + { + "epoch": 0.7660660409983663, + "grad_norm": 0.1547636091709137, + "learning_rate": 0.001, + "loss": 2.1268, + "num_input_tokens_seen": 42093496544, + "step": 80300 + }, + { + "epoch": 0.7665430435145545, + "grad_norm": 0.1418161541223526, + "learning_rate": 0.001, + "loss": 2.1273, + "num_input_tokens_seen": 42119710944, + "step": 80350 + }, + { + "epoch": 0.7670200460307428, + "grad_norm": 0.1477021723985672, + "learning_rate": 0.001, + "loss": 2.1261, + "num_input_tokens_seen": 42145920448, + "step": 80400 + }, + { + "epoch": 0.7674970485469311, + "grad_norm": 0.14230799674987793, + "learning_rate": 0.001, + "loss": 2.118, + "num_input_tokens_seen": 42172131008, + "step": 80450 + }, + { + "epoch": 0.7679740510631193, + "grad_norm": 0.14658768475055695, + "learning_rate": 0.001, + "loss": 2.1247, + "num_input_tokens_seen": 42198339200, + "step": 80500 + }, + { + "epoch": 0.7679740510631193, + "eval_loss": 2.0385217666625977, + "eval_runtime": 79.7406, + "eval_samples_per_second": 62.703, + "eval_steps_per_second": 15.676, + "num_input_tokens_seen": 42198339200, + "step": 80500 + }, + { + "epoch": 0.7684510535793077, + "grad_norm": 0.14163857698440552, + "learning_rate": 0.001, + "loss": 2.1172, + "num_input_tokens_seen": 42224552832, + "step": 80550 + }, + { + "epoch": 0.7689280560954959, + "grad_norm": 0.14124558866024017, + "learning_rate": 0.001, + "loss": 2.1313, + "num_input_tokens_seen": 42250764928, + "step": 80600 + }, + { + "epoch": 0.7694050586116842, + "grad_norm": 0.14133115112781525, + "learning_rate": 0.001, + "loss": 2.1207, + "num_input_tokens_seen": 42276974400, + "step": 80650 + }, + { + "epoch": 0.7698820611278725, + "grad_norm": 0.15105165541172028, + "learning_rate": 0.001, + "loss": 2.1235, + "num_input_tokens_seen": 42303188800, + "step": 80700 + }, + { + "epoch": 0.7703590636440607, + "grad_norm": 0.1437007337808609, + "learning_rate": 0.001, + "loss": 2.1153, + "num_input_tokens_seen": 42329398016, + "step": 80750 + }, + { + "epoch": 0.770836066160249, + "grad_norm": 0.138414204120636, + "learning_rate": 0.001, + "loss": 2.1218, + "num_input_tokens_seen": 42355602880, + "step": 80800 + }, + { + "epoch": 0.7713130686764372, + "grad_norm": 0.15313681960105896, + "learning_rate": 0.001, + "loss": 2.1272, + "num_input_tokens_seen": 42381812672, + "step": 80850 + }, + { + "epoch": 0.7717900711926255, + "grad_norm": 0.1474558264017105, + "learning_rate": 0.001, + "loss": 2.1233, + "num_input_tokens_seen": 42408026560, + "step": 80900 + }, + { + "epoch": 0.7722670737088139, + "grad_norm": 0.14552924036979675, + "learning_rate": 0.001, + "loss": 2.1171, + "num_input_tokens_seen": 42434238880, + "step": 80950 + }, + { + "epoch": 0.7727440762250021, + "grad_norm": 0.14388687908649445, + "learning_rate": 0.001, + "loss": 2.1147, + "num_input_tokens_seen": 42460451136, + "step": 81000 + }, + { + "epoch": 0.7727440762250021, + "eval_loss": 2.0367112159729004, + "eval_runtime": 79.262, + "eval_samples_per_second": 63.082, + "eval_steps_per_second": 15.77, + "num_input_tokens_seen": 42460451136, + "step": 81000 + }, + { + "epoch": 0.7732210787411904, + "grad_norm": 0.14675873517990112, + "learning_rate": 0.001, + "loss": 2.1202, + "num_input_tokens_seen": 42486665536, + "step": 81050 + }, + { + "epoch": 0.7736980812573786, + "grad_norm": 0.14389154314994812, + "learning_rate": 0.001, + "loss": 2.1267, + "num_input_tokens_seen": 42512873152, + "step": 81100 + }, + { + "epoch": 0.7741750837735669, + "grad_norm": 0.13341355323791504, + "learning_rate": 0.001, + "loss": 2.1242, + "num_input_tokens_seen": 42539087552, + "step": 81150 + }, + { + "epoch": 0.7746520862897551, + "grad_norm": 0.14013975858688354, + "learning_rate": 0.001, + "loss": 2.1256, + "num_input_tokens_seen": 42565299392, + "step": 81200 + }, + { + "epoch": 0.7751290888059434, + "grad_norm": 0.1397426873445511, + "learning_rate": 0.001, + "loss": 2.1191, + "num_input_tokens_seen": 42591510720, + "step": 81250 + }, + { + "epoch": 0.7756060913221318, + "grad_norm": 0.148366779088974, + "learning_rate": 0.001, + "loss": 2.121, + "num_input_tokens_seen": 42617720416, + "step": 81300 + }, + { + "epoch": 0.77608309383832, + "grad_norm": 0.14177195727825165, + "learning_rate": 0.001, + "loss": 2.1253, + "num_input_tokens_seen": 42643931360, + "step": 81350 + }, + { + "epoch": 0.7765600963545083, + "grad_norm": 0.15625756978988647, + "learning_rate": 0.001, + "loss": 2.1161, + "num_input_tokens_seen": 42670145344, + "step": 81400 + }, + { + "epoch": 0.7770370988706965, + "grad_norm": 0.16141097247600555, + "learning_rate": 0.001, + "loss": 2.1144, + "num_input_tokens_seen": 42696355872, + "step": 81450 + }, + { + "epoch": 0.7775141013868848, + "grad_norm": 0.14571966230869293, + "learning_rate": 0.001, + "loss": 2.1265, + "num_input_tokens_seen": 42722562592, + "step": 81500 + }, + { + "epoch": 0.7775141013868848, + "eval_loss": 2.0364601612091064, + "eval_runtime": 79.6872, + "eval_samples_per_second": 62.745, + "eval_steps_per_second": 15.686, + "num_input_tokens_seen": 42722562592, + "step": 81500 + }, + { + "epoch": 0.7779911039030731, + "grad_norm": 0.14065228402614594, + "learning_rate": 0.001, + "loss": 2.1229, + "num_input_tokens_seen": 42748776992, + "step": 81550 + }, + { + "epoch": 0.7784681064192613, + "grad_norm": 0.13356003165245056, + "learning_rate": 0.001, + "loss": 2.1122, + "num_input_tokens_seen": 42774986944, + "step": 81600 + }, + { + "epoch": 0.7789451089354497, + "grad_norm": 0.1398439258337021, + "learning_rate": 0.001, + "loss": 2.1304, + "num_input_tokens_seen": 42801193248, + "step": 81650 + }, + { + "epoch": 0.7794221114516379, + "grad_norm": 0.14399580657482147, + "learning_rate": 0.001, + "loss": 2.1191, + "num_input_tokens_seen": 42827397664, + "step": 81700 + }, + { + "epoch": 0.7798991139678262, + "grad_norm": 0.1511550098657608, + "learning_rate": 0.001, + "loss": 2.1299, + "num_input_tokens_seen": 42853609184, + "step": 81750 + }, + { + "epoch": 0.7803761164840145, + "grad_norm": 0.13643226027488708, + "learning_rate": 0.001, + "loss": 2.1176, + "num_input_tokens_seen": 42879823584, + "step": 81800 + }, + { + "epoch": 0.7808531190002027, + "grad_norm": 0.15320724248886108, + "learning_rate": 0.001, + "loss": 2.1268, + "num_input_tokens_seen": 42906032096, + "step": 81850 + }, + { + "epoch": 0.781330121516391, + "grad_norm": 0.15477311611175537, + "learning_rate": 0.001, + "loss": 2.1173, + "num_input_tokens_seen": 42932238784, + "step": 81900 + }, + { + "epoch": 0.7818071240325792, + "grad_norm": 0.1393759399652481, + "learning_rate": 0.001, + "loss": 2.1148, + "num_input_tokens_seen": 42958444704, + "step": 81950 + }, + { + "epoch": 0.7822841265487676, + "grad_norm": 0.14024987816810608, + "learning_rate": 0.001, + "loss": 2.1254, + "num_input_tokens_seen": 42984652928, + "step": 82000 + }, + { + "epoch": 0.7822841265487676, + "eval_loss": 2.035764694213867, + "eval_runtime": 80.5252, + "eval_samples_per_second": 62.092, + "eval_steps_per_second": 15.523, + "num_input_tokens_seen": 42984652928, + "step": 82000 + }, + { + "epoch": 0.7827611290649558, + "grad_norm": 0.14049945771694183, + "learning_rate": 0.001, + "loss": 2.1148, + "num_input_tokens_seen": 43010864736, + "step": 82050 + }, + { + "epoch": 0.7832381315811441, + "grad_norm": 0.14106184244155884, + "learning_rate": 0.001, + "loss": 2.127, + "num_input_tokens_seen": 43037075424, + "step": 82100 + }, + { + "epoch": 0.7837151340973324, + "grad_norm": 0.153049498796463, + "learning_rate": 0.001, + "loss": 2.1243, + "num_input_tokens_seen": 43063287616, + "step": 82150 + }, + { + "epoch": 0.7841921366135206, + "grad_norm": 0.13522286713123322, + "learning_rate": 0.001, + "loss": 2.1202, + "num_input_tokens_seen": 43089496256, + "step": 82200 + }, + { + "epoch": 0.7846691391297089, + "grad_norm": 0.1484510898590088, + "learning_rate": 0.001, + "loss": 2.1174, + "num_input_tokens_seen": 43115709216, + "step": 82250 + }, + { + "epoch": 0.7851461416458971, + "grad_norm": 0.13766340911388397, + "learning_rate": 0.001, + "loss": 2.1113, + "num_input_tokens_seen": 43141917216, + "step": 82300 + }, + { + "epoch": 0.7856231441620855, + "grad_norm": 0.15031974017620087, + "learning_rate": 0.001, + "loss": 2.1122, + "num_input_tokens_seen": 43168114496, + "step": 82350 + }, + { + "epoch": 0.7861001466782738, + "grad_norm": 0.1554354727268219, + "learning_rate": 0.001, + "loss": 2.1179, + "num_input_tokens_seen": 43194316224, + "step": 82400 + }, + { + "epoch": 0.786577149194462, + "grad_norm": 0.1402343511581421, + "learning_rate": 0.001, + "loss": 2.1107, + "num_input_tokens_seen": 43220521024, + "step": 82450 + }, + { + "epoch": 0.7870541517106503, + "grad_norm": 0.1455002725124359, + "learning_rate": 0.001, + "loss": 2.1121, + "num_input_tokens_seen": 43246727552, + "step": 82500 + }, + { + "epoch": 0.7870541517106503, + "eval_loss": 2.0353808403015137, + "eval_runtime": 79.7931, + "eval_samples_per_second": 62.662, + "eval_steps_per_second": 15.666, + "num_input_tokens_seen": 43246727552, + "step": 82500 + }, + { + "epoch": 0.7875311542268385, + "grad_norm": 0.13784560561180115, + "learning_rate": 0.001, + "loss": 2.1218, + "num_input_tokens_seen": 43272941952, + "step": 82550 + }, + { + "epoch": 0.7880081567430268, + "grad_norm": 0.16629286110401154, + "learning_rate": 0.001, + "loss": 2.1257, + "num_input_tokens_seen": 43299148416, + "step": 82600 + }, + { + "epoch": 0.7884851592592151, + "grad_norm": 0.14138463139533997, + "learning_rate": 0.001, + "loss": 2.1247, + "num_input_tokens_seen": 43325357152, + "step": 82650 + }, + { + "epoch": 0.7889621617754033, + "grad_norm": 0.1476413905620575, + "learning_rate": 0.001, + "loss": 2.1261, + "num_input_tokens_seen": 43351571552, + "step": 82700 + }, + { + "epoch": 0.7894391642915917, + "grad_norm": 0.14458167552947998, + "learning_rate": 0.001, + "loss": 2.1186, + "num_input_tokens_seen": 43377781824, + "step": 82750 + }, + { + "epoch": 0.7899161668077799, + "grad_norm": 0.14135339856147766, + "learning_rate": 0.001, + "loss": 2.1145, + "num_input_tokens_seen": 43403986336, + "step": 82800 + }, + { + "epoch": 0.7903931693239682, + "grad_norm": 0.1399611085653305, + "learning_rate": 0.001, + "loss": 2.1116, + "num_input_tokens_seen": 43430200736, + "step": 82850 + }, + { + "epoch": 0.7908701718401565, + "grad_norm": 0.14598487317562103, + "learning_rate": 0.001, + "loss": 2.1219, + "num_input_tokens_seen": 43456414848, + "step": 82900 + }, + { + "epoch": 0.7913471743563447, + "grad_norm": 0.14239011704921722, + "learning_rate": 0.001, + "loss": 2.1049, + "num_input_tokens_seen": 43482628512, + "step": 82950 + }, + { + "epoch": 0.791824176872533, + "grad_norm": 0.1495833843946457, + "learning_rate": 0.001, + "loss": 2.1235, + "num_input_tokens_seen": 43508842912, + "step": 83000 + }, + { + "epoch": 0.791824176872533, + "eval_loss": 2.0351996421813965, + "eval_runtime": 80.7069, + "eval_samples_per_second": 61.953, + "eval_steps_per_second": 15.488, + "num_input_tokens_seen": 43508842912, + "step": 83000 + }, + { + "epoch": 0.7923011793887212, + "grad_norm": 0.14430342614650726, + "learning_rate": 0.001, + "loss": 2.1139, + "num_input_tokens_seen": 43535053536, + "step": 83050 + }, + { + "epoch": 0.7927781819049096, + "grad_norm": 0.14061962068080902, + "learning_rate": 0.001, + "loss": 2.1227, + "num_input_tokens_seen": 43561267936, + "step": 83100 + }, + { + "epoch": 0.7932551844210978, + "grad_norm": 0.14604133367538452, + "learning_rate": 0.001, + "loss": 2.1224, + "num_input_tokens_seen": 43587477408, + "step": 83150 + }, + { + "epoch": 0.7937321869372861, + "grad_norm": 0.1432175487279892, + "learning_rate": 0.001, + "loss": 2.1277, + "num_input_tokens_seen": 43613684960, + "step": 83200 + }, + { + "epoch": 0.7942091894534744, + "grad_norm": 0.15234777331352234, + "learning_rate": 0.001, + "loss": 2.1082, + "num_input_tokens_seen": 43639894112, + "step": 83250 + }, + { + "epoch": 0.7946861919696626, + "grad_norm": 0.14436079561710358, + "learning_rate": 0.001, + "loss": 2.1207, + "num_input_tokens_seen": 43666103104, + "step": 83300 + }, + { + "epoch": 0.7951631944858509, + "grad_norm": 0.14395667612552643, + "learning_rate": 0.001, + "loss": 2.1219, + "num_input_tokens_seen": 43692313184, + "step": 83350 + }, + { + "epoch": 0.7956401970020391, + "grad_norm": 0.13969875872135162, + "learning_rate": 0.001, + "loss": 2.1152, + "num_input_tokens_seen": 43718525536, + "step": 83400 + }, + { + "epoch": 0.7961171995182275, + "grad_norm": 0.151366725564003, + "learning_rate": 0.001, + "loss": 2.1168, + "num_input_tokens_seen": 43744737728, + "step": 83450 + }, + { + "epoch": 0.7965942020344158, + "grad_norm": 0.13248160481452942, + "learning_rate": 0.001, + "loss": 2.1192, + "num_input_tokens_seen": 43770947360, + "step": 83500 + }, + { + "epoch": 0.7965942020344158, + "eval_loss": 2.0351762771606445, + "eval_runtime": 79.3184, + "eval_samples_per_second": 63.037, + "eval_steps_per_second": 15.759, + "num_input_tokens_seen": 43770947360, + "step": 83500 + }, + { + "epoch": 0.797071204550604, + "grad_norm": 0.14381654560565948, + "learning_rate": 0.001, + "loss": 2.1128, + "num_input_tokens_seen": 43797158272, + "step": 83550 + }, + { + "epoch": 0.7975482070667923, + "grad_norm": 0.1607636660337448, + "learning_rate": 0.001, + "loss": 2.1185, + "num_input_tokens_seen": 43823371904, + "step": 83600 + }, + { + "epoch": 0.7980252095829805, + "grad_norm": 0.1534896194934845, + "learning_rate": 0.001, + "loss": 2.1191, + "num_input_tokens_seen": 43849584512, + "step": 83650 + }, + { + "epoch": 0.7985022120991688, + "grad_norm": 0.1401808112859726, + "learning_rate": 0.001, + "loss": 2.1228, + "num_input_tokens_seen": 43875796832, + "step": 83700 + }, + { + "epoch": 0.7989792146153571, + "grad_norm": 0.15275578200817108, + "learning_rate": 0.001, + "loss": 2.1153, + "num_input_tokens_seen": 43902011232, + "step": 83750 + }, + { + "epoch": 0.7994562171315454, + "grad_norm": 0.13924409449100494, + "learning_rate": 0.001, + "loss": 2.1208, + "num_input_tokens_seen": 43928216352, + "step": 83800 + }, + { + "epoch": 0.7999332196477337, + "grad_norm": 0.18342813849449158, + "learning_rate": 0.001, + "loss": 2.1694, + "num_input_tokens_seen": 43954427648, + "step": 83850 + }, + { + "epoch": 0.8004102221639219, + "grad_norm": 0.14373578131198883, + "learning_rate": 0.001, + "loss": 2.1333, + "num_input_tokens_seen": 43980640928, + "step": 83900 + }, + { + "epoch": 0.8008872246801102, + "grad_norm": 0.1297065019607544, + "learning_rate": 0.001, + "loss": 2.134, + "num_input_tokens_seen": 44006842240, + "step": 83950 + }, + { + "epoch": 0.8013642271962984, + "grad_norm": 0.14140845835208893, + "learning_rate": 0.001, + "loss": 2.1303, + "num_input_tokens_seen": 44033053408, + "step": 84000 + }, + { + "epoch": 0.8013642271962984, + "eval_loss": 2.038163185119629, + "eval_runtime": 80.0722, + "eval_samples_per_second": 62.444, + "eval_steps_per_second": 15.611, + "num_input_tokens_seen": 44033053408, + "step": 84000 + }, + { + "epoch": 0.8018412297124867, + "grad_norm": 0.13576118648052216, + "learning_rate": 0.001, + "loss": 2.1238, + "num_input_tokens_seen": 44059265280, + "step": 84050 + }, + { + "epoch": 0.802318232228675, + "grad_norm": 0.14275366067886353, + "learning_rate": 0.001, + "loss": 2.1173, + "num_input_tokens_seen": 44085474144, + "step": 84100 + }, + { + "epoch": 0.8027952347448633, + "grad_norm": 0.1358124315738678, + "learning_rate": 0.001, + "loss": 2.1174, + "num_input_tokens_seen": 44111683840, + "step": 84150 + }, + { + "epoch": 0.8032722372610516, + "grad_norm": 0.14228695631027222, + "learning_rate": 0.001, + "loss": 2.1142, + "num_input_tokens_seen": 44137898240, + "step": 84200 + }, + { + "epoch": 0.8037492397772398, + "grad_norm": 0.13854092359542847, + "learning_rate": 0.001, + "loss": 2.1099, + "num_input_tokens_seen": 44164112544, + "step": 84250 + }, + { + "epoch": 0.8042262422934281, + "grad_norm": 0.13311374187469482, + "learning_rate": 0.001, + "loss": 2.1177, + "num_input_tokens_seen": 44190326560, + "step": 84300 + }, + { + "epoch": 0.8047032448096164, + "grad_norm": 0.13964787125587463, + "learning_rate": 0.001, + "loss": 2.122, + "num_input_tokens_seen": 44216539296, + "step": 84350 + }, + { + "epoch": 0.8051802473258046, + "grad_norm": 0.13378119468688965, + "learning_rate": 0.001, + "loss": 2.1147, + "num_input_tokens_seen": 44242753696, + "step": 84400 + }, + { + "epoch": 0.8056572498419929, + "grad_norm": 0.1523156464099884, + "learning_rate": 0.001, + "loss": 2.1218, + "num_input_tokens_seen": 44268968096, + "step": 84450 + }, + { + "epoch": 0.8061342523581811, + "grad_norm": 0.14103132486343384, + "learning_rate": 0.001, + "loss": 2.1247, + "num_input_tokens_seen": 44295182016, + "step": 84500 + }, + { + "epoch": 0.8061342523581811, + "eval_loss": 2.033738374710083, + "eval_runtime": 79.8888, + "eval_samples_per_second": 62.587, + "eval_steps_per_second": 15.647, + "num_input_tokens_seen": 44295182016, + "step": 84500 + }, + { + "epoch": 0.8066112548743695, + "grad_norm": 0.14938974380493164, + "learning_rate": 0.001, + "loss": 2.1105, + "num_input_tokens_seen": 44321393184, + "step": 84550 + }, + { + "epoch": 0.8070882573905578, + "grad_norm": 0.15471114218235016, + "learning_rate": 0.001, + "loss": 2.124, + "num_input_tokens_seen": 44347607584, + "step": 84600 + }, + { + "epoch": 0.807565259906746, + "grad_norm": 0.14569403231143951, + "learning_rate": 0.001, + "loss": 2.1155, + "num_input_tokens_seen": 44373816224, + "step": 84650 + }, + { + "epoch": 0.8080422624229343, + "grad_norm": 0.1456989198923111, + "learning_rate": 0.001, + "loss": 2.1251, + "num_input_tokens_seen": 44400021216, + "step": 84700 + }, + { + "epoch": 0.8085192649391225, + "grad_norm": 0.1367526352405548, + "learning_rate": 0.001, + "loss": 2.1146, + "num_input_tokens_seen": 44426223008, + "step": 84750 + }, + { + "epoch": 0.8089962674553108, + "grad_norm": 0.14826616644859314, + "learning_rate": 0.001, + "loss": 2.1185, + "num_input_tokens_seen": 44452436000, + "step": 84800 + }, + { + "epoch": 0.8094732699714992, + "grad_norm": 0.15011751651763916, + "learning_rate": 0.001, + "loss": 2.1281, + "num_input_tokens_seen": 44478650368, + "step": 84850 + }, + { + "epoch": 0.8099502724876874, + "grad_norm": 0.14236512780189514, + "learning_rate": 0.001, + "loss": 2.1205, + "num_input_tokens_seen": 44504864768, + "step": 84900 + }, + { + "epoch": 0.8104272750038757, + "grad_norm": 0.14300031960010529, + "learning_rate": 0.001, + "loss": 2.1319, + "num_input_tokens_seen": 44531075200, + "step": 84950 + }, + { + "epoch": 0.8109042775200639, + "grad_norm": 0.13161155581474304, + "learning_rate": 0.001, + "loss": 2.125, + "num_input_tokens_seen": 44557289600, + "step": 85000 + }, + { + "epoch": 0.8109042775200639, + "eval_loss": 2.032745599746704, + "eval_runtime": 80.1389, + "eval_samples_per_second": 62.392, + "eval_steps_per_second": 15.598, + "num_input_tokens_seen": 44557289600, + "step": 85000 + }, + { + "epoch": 0.8113812800362522, + "grad_norm": 0.15233619511127472, + "learning_rate": 0.001, + "loss": 2.1097, + "num_input_tokens_seen": 44583500576, + "step": 85050 + }, + { + "epoch": 0.8118582825524404, + "grad_norm": 0.15628154575824738, + "learning_rate": 0.001, + "loss": 2.1166, + "num_input_tokens_seen": 44609714112, + "step": 85100 + }, + { + "epoch": 0.8123352850686287, + "grad_norm": 0.15254861116409302, + "learning_rate": 0.001, + "loss": 2.1243, + "num_input_tokens_seen": 44635927648, + "step": 85150 + }, + { + "epoch": 0.812812287584817, + "grad_norm": 0.1345020979642868, + "learning_rate": 0.001, + "loss": 2.1265, + "num_input_tokens_seen": 44662141408, + "step": 85200 + }, + { + "epoch": 0.8132892901010053, + "grad_norm": 0.14372238516807556, + "learning_rate": 0.001, + "loss": 2.1238, + "num_input_tokens_seen": 44688355808, + "step": 85250 + }, + { + "epoch": 0.8137662926171936, + "grad_norm": 0.14160767197608948, + "learning_rate": 0.001, + "loss": 2.1276, + "num_input_tokens_seen": 44714565152, + "step": 85300 + }, + { + "epoch": 0.8142432951333818, + "grad_norm": 0.15178006887435913, + "learning_rate": 0.001, + "loss": 2.1171, + "num_input_tokens_seen": 44740776032, + "step": 85350 + }, + { + "epoch": 0.8147202976495701, + "grad_norm": 0.1339855045080185, + "learning_rate": 0.001, + "loss": 2.0985, + "num_input_tokens_seen": 44766988608, + "step": 85400 + }, + { + "epoch": 0.8151973001657584, + "grad_norm": 0.1480085402727127, + "learning_rate": 0.001, + "loss": 2.1102, + "num_input_tokens_seen": 44793200896, + "step": 85450 + }, + { + "epoch": 0.8156743026819466, + "grad_norm": 0.18293645977973938, + "learning_rate": 0.001, + "loss": 2.1218, + "num_input_tokens_seen": 44819414752, + "step": 85500 + }, + { + "epoch": 0.8156743026819466, + "eval_loss": 2.031679630279541, + "eval_runtime": 79.5824, + "eval_samples_per_second": 62.828, + "eval_steps_per_second": 15.707, + "num_input_tokens_seen": 44819414752, + "step": 85500 + }, + { + "epoch": 0.816151305198135, + "grad_norm": 0.14524328708648682, + "learning_rate": 0.001, + "loss": 2.1085, + "num_input_tokens_seen": 44845612704, + "step": 85550 + }, + { + "epoch": 0.8166283077143232, + "grad_norm": 0.14643549919128418, + "learning_rate": 0.001, + "loss": 2.1151, + "num_input_tokens_seen": 44871822016, + "step": 85600 + }, + { + "epoch": 0.8171053102305115, + "grad_norm": 0.14329664409160614, + "learning_rate": 0.001, + "loss": 2.1226, + "num_input_tokens_seen": 44898035456, + "step": 85650 + }, + { + "epoch": 0.8175823127466998, + "grad_norm": 0.14474226534366608, + "learning_rate": 0.001, + "loss": 2.109, + "num_input_tokens_seen": 44924248160, + "step": 85700 + }, + { + "epoch": 0.818059315262888, + "grad_norm": 0.15638591349124908, + "learning_rate": 0.001, + "loss": 2.1111, + "num_input_tokens_seen": 44950456736, + "step": 85750 + }, + { + "epoch": 0.8185363177790763, + "grad_norm": 0.14359885454177856, + "learning_rate": 0.001, + "loss": 2.1123, + "num_input_tokens_seen": 44976671136, + "step": 85800 + }, + { + "epoch": 0.8190133202952645, + "grad_norm": 0.14419220387935638, + "learning_rate": 0.001, + "loss": 2.1071, + "num_input_tokens_seen": 45002877984, + "step": 85850 + }, + { + "epoch": 0.8194903228114528, + "grad_norm": 0.1485709697008133, + "learning_rate": 0.001, + "loss": 2.1094, + "num_input_tokens_seen": 45029088992, + "step": 85900 + }, + { + "epoch": 0.819967325327641, + "grad_norm": 0.14082056283950806, + "learning_rate": 0.001, + "loss": 2.1182, + "num_input_tokens_seen": 45055300768, + "step": 85950 + }, + { + "epoch": 0.8204443278438294, + "grad_norm": 0.13490447402000427, + "learning_rate": 0.001, + "loss": 2.1122, + "num_input_tokens_seen": 45081505024, + "step": 86000 + }, + { + "epoch": 0.8204443278438294, + "eval_loss": 2.0312297344207764, + "eval_runtime": 80.2167, + "eval_samples_per_second": 62.331, + "eval_steps_per_second": 15.583, + "num_input_tokens_seen": 45081505024, + "step": 86000 + }, + { + "epoch": 0.8209213303600177, + "grad_norm": 0.14423319697380066, + "learning_rate": 0.001, + "loss": 2.1117, + "num_input_tokens_seen": 45107719424, + "step": 86050 + }, + { + "epoch": 0.8213983328762059, + "grad_norm": 0.14507217705249786, + "learning_rate": 0.001, + "loss": 2.1116, + "num_input_tokens_seen": 45133929184, + "step": 86100 + }, + { + "epoch": 0.8218753353923942, + "grad_norm": 0.14613692462444305, + "learning_rate": 0.001, + "loss": 2.1068, + "num_input_tokens_seen": 45160135904, + "step": 86150 + }, + { + "epoch": 0.8223523379085824, + "grad_norm": 0.15299580991268158, + "learning_rate": 0.001, + "loss": 2.1134, + "num_input_tokens_seen": 45186346976, + "step": 86200 + }, + { + "epoch": 0.8228293404247707, + "grad_norm": 0.16637030243873596, + "learning_rate": 0.001, + "loss": 2.116, + "num_input_tokens_seen": 45212552576, + "step": 86250 + }, + { + "epoch": 0.8233063429409591, + "grad_norm": 0.14588510990142822, + "learning_rate": 0.001, + "loss": 2.1157, + "num_input_tokens_seen": 45238766976, + "step": 86300 + }, + { + "epoch": 0.8237833454571473, + "grad_norm": 0.1367158144712448, + "learning_rate": 0.001, + "loss": 2.1215, + "num_input_tokens_seen": 45264975488, + "step": 86350 + }, + { + "epoch": 0.8242603479733356, + "grad_norm": 0.14144419133663177, + "learning_rate": 0.001, + "loss": 2.1124, + "num_input_tokens_seen": 45291175360, + "step": 86400 + }, + { + "epoch": 0.8247373504895238, + "grad_norm": 0.1478971391916275, + "learning_rate": 0.001, + "loss": 2.1151, + "num_input_tokens_seen": 45317382912, + "step": 86450 + }, + { + "epoch": 0.8252143530057121, + "grad_norm": 0.14577680826187134, + "learning_rate": 0.001, + "loss": 2.1293, + "num_input_tokens_seen": 45343592896, + "step": 86500 + }, + { + "epoch": 0.8252143530057121, + "eval_loss": 2.0312957763671875, + "eval_runtime": 80.127, + "eval_samples_per_second": 62.401, + "eval_steps_per_second": 15.6, + "num_input_tokens_seen": 45343592896, + "step": 86500 + }, + { + "epoch": 0.8256913555219004, + "grad_norm": 0.1390218287706375, + "learning_rate": 0.001, + "loss": 2.1133, + "num_input_tokens_seen": 45369805792, + "step": 86550 + }, + { + "epoch": 0.8261683580380886, + "grad_norm": 0.1416807472705841, + "learning_rate": 0.001, + "loss": 2.1144, + "num_input_tokens_seen": 45396019232, + "step": 86600 + }, + { + "epoch": 0.826645360554277, + "grad_norm": 0.1379116177558899, + "learning_rate": 0.001, + "loss": 2.1105, + "num_input_tokens_seen": 45422224160, + "step": 86650 + }, + { + "epoch": 0.8271223630704652, + "grad_norm": 0.13901980221271515, + "learning_rate": 0.001, + "loss": 2.1092, + "num_input_tokens_seen": 45448438560, + "step": 86700 + }, + { + "epoch": 0.8275993655866535, + "grad_norm": 0.14398328959941864, + "learning_rate": 0.001, + "loss": 2.1071, + "num_input_tokens_seen": 45474652960, + "step": 86750 + }, + { + "epoch": 0.8280763681028418, + "grad_norm": 0.14946867525577545, + "learning_rate": 0.001, + "loss": 2.1299, + "num_input_tokens_seen": 45500863392, + "step": 86800 + }, + { + "epoch": 0.82855337061903, + "grad_norm": 0.14332331717014313, + "learning_rate": 0.001, + "loss": 2.1204, + "num_input_tokens_seen": 45527074752, + "step": 86850 + }, + { + "epoch": 0.8290303731352183, + "grad_norm": 0.15574564039707184, + "learning_rate": 0.001, + "loss": 2.1226, + "num_input_tokens_seen": 45553284800, + "step": 86900 + }, + { + "epoch": 0.8295073756514065, + "grad_norm": 0.12894290685653687, + "learning_rate": 0.001, + "loss": 2.1195, + "num_input_tokens_seen": 45579494208, + "step": 86950 + }, + { + "epoch": 0.8299843781675948, + "grad_norm": 0.14729012548923492, + "learning_rate": 0.001, + "loss": 2.1161, + "num_input_tokens_seen": 45605708608, + "step": 87000 + }, + { + "epoch": 0.8299843781675948, + "eval_loss": 2.0334672927856445, + "eval_runtime": 80.1275, + "eval_samples_per_second": 62.401, + "eval_steps_per_second": 15.6, + "num_input_tokens_seen": 45605708608, + "step": 87000 + }, + { + "epoch": 0.8304613806837831, + "grad_norm": 0.1420363485813141, + "learning_rate": 0.001, + "loss": 2.1129, + "num_input_tokens_seen": 45631921280, + "step": 87050 + }, + { + "epoch": 0.8309383831999714, + "grad_norm": 0.15057820081710815, + "learning_rate": 0.001, + "loss": 2.1131, + "num_input_tokens_seen": 45658134272, + "step": 87100 + }, + { + "epoch": 0.8314153857161597, + "grad_norm": 0.1428225189447403, + "learning_rate": 0.001, + "loss": 2.0981, + "num_input_tokens_seen": 45684341440, + "step": 87150 + }, + { + "epoch": 0.8318923882323479, + "grad_norm": 0.1462431401014328, + "learning_rate": 0.001, + "loss": 2.1211, + "num_input_tokens_seen": 45710546944, + "step": 87200 + }, + { + "epoch": 0.8323693907485362, + "grad_norm": 0.14011114835739136, + "learning_rate": 0.001, + "loss": 2.1104, + "num_input_tokens_seen": 45736761344, + "step": 87250 + }, + { + "epoch": 0.8328463932647244, + "grad_norm": 0.14002341032028198, + "learning_rate": 0.001, + "loss": 2.1158, + "num_input_tokens_seen": 45762972928, + "step": 87300 + }, + { + "epoch": 0.8333233957809127, + "grad_norm": 0.14873993396759033, + "learning_rate": 0.001, + "loss": 2.116, + "num_input_tokens_seen": 45789185152, + "step": 87350 + }, + { + "epoch": 0.8338003982971011, + "grad_norm": 0.15025608241558075, + "learning_rate": 0.001, + "loss": 2.1213, + "num_input_tokens_seen": 45815394784, + "step": 87400 + }, + { + "epoch": 0.8342774008132893, + "grad_norm": 0.23774513602256775, + "learning_rate": 0.001, + "loss": 2.118, + "num_input_tokens_seen": 45841605312, + "step": 87450 + }, + { + "epoch": 0.8347544033294776, + "grad_norm": 0.15170574188232422, + "learning_rate": 0.001, + "loss": 2.1237, + "num_input_tokens_seen": 45867808448, + "step": 87500 + }, + { + "epoch": 0.8347544033294776, + "eval_loss": 2.036484718322754, + "eval_runtime": 79.7877, + "eval_samples_per_second": 62.666, + "eval_steps_per_second": 15.667, + "num_input_tokens_seen": 45867808448, + "step": 87500 + }, + { + "epoch": 0.8352314058456658, + "grad_norm": 0.14227113127708435, + "learning_rate": 0.001, + "loss": 2.1245, + "num_input_tokens_seen": 45894014560, + "step": 87550 + }, + { + "epoch": 0.8357084083618541, + "grad_norm": 0.13852350413799286, + "learning_rate": 0.001, + "loss": 2.1221, + "num_input_tokens_seen": 45920222752, + "step": 87600 + }, + { + "epoch": 0.8361854108780424, + "grad_norm": 0.1606854796409607, + "learning_rate": 0.001, + "loss": 2.1237, + "num_input_tokens_seen": 45946435392, + "step": 87650 + }, + { + "epoch": 0.8366624133942306, + "grad_norm": 0.13357940316200256, + "learning_rate": 0.001, + "loss": 2.1191, + "num_input_tokens_seen": 45972631840, + "step": 87700 + }, + { + "epoch": 0.837139415910419, + "grad_norm": 0.1375136822462082, + "learning_rate": 0.001, + "loss": 2.1137, + "num_input_tokens_seen": 45998825152, + "step": 87750 + }, + { + "epoch": 0.8376164184266072, + "grad_norm": 0.14692631363868713, + "learning_rate": 0.001, + "loss": 2.1137, + "num_input_tokens_seen": 46025038752, + "step": 87800 + }, + { + "epoch": 0.8380934209427955, + "grad_norm": 0.1487261950969696, + "learning_rate": 0.001, + "loss": 2.12, + "num_input_tokens_seen": 46051252512, + "step": 87850 + }, + { + "epoch": 0.8385704234589837, + "grad_norm": 0.13279280066490173, + "learning_rate": 0.001, + "loss": 2.1101, + "num_input_tokens_seen": 46077466688, + "step": 87900 + }, + { + "epoch": 0.839047425975172, + "grad_norm": 0.17393696308135986, + "learning_rate": 0.001, + "loss": 2.1021, + "num_input_tokens_seen": 46103677312, + "step": 87950 + }, + { + "epoch": 0.8395244284913603, + "grad_norm": 0.13198982179164886, + "learning_rate": 0.001, + "loss": 2.1206, + "num_input_tokens_seen": 46129884928, + "step": 88000 + }, + { + "epoch": 0.8395244284913603, + "eval_loss": 2.031299591064453, + "eval_runtime": 79.9006, + "eval_samples_per_second": 62.578, + "eval_steps_per_second": 15.644, + "num_input_tokens_seen": 46129884928, + "step": 88000 + }, + { + "epoch": 0.8400014310075485, + "grad_norm": 0.1394035667181015, + "learning_rate": 0.001, + "loss": 2.1158, + "num_input_tokens_seen": 46156095744, + "step": 88050 + }, + { + "epoch": 0.8404784335237369, + "grad_norm": 0.13993045687675476, + "learning_rate": 0.001, + "loss": 2.1124, + "num_input_tokens_seen": 46182303680, + "step": 88100 + }, + { + "epoch": 0.8409554360399251, + "grad_norm": 0.13323958218097687, + "learning_rate": 0.001, + "loss": 2.1096, + "num_input_tokens_seen": 46208506784, + "step": 88150 + }, + { + "epoch": 0.8414324385561134, + "grad_norm": 0.15271630883216858, + "learning_rate": 0.001, + "loss": 2.1128, + "num_input_tokens_seen": 46234719360, + "step": 88200 + }, + { + "epoch": 0.8419094410723017, + "grad_norm": 0.14392182230949402, + "learning_rate": 0.001, + "loss": 2.1105, + "num_input_tokens_seen": 46260917728, + "step": 88250 + }, + { + "epoch": 0.8423864435884899, + "grad_norm": 0.14050635695457458, + "learning_rate": 0.001, + "loss": 2.1149, + "num_input_tokens_seen": 46287129376, + "step": 88300 + }, + { + "epoch": 0.8428634461046782, + "grad_norm": 0.15242235362529755, + "learning_rate": 0.001, + "loss": 2.1255, + "num_input_tokens_seen": 46313338304, + "step": 88350 + }, + { + "epoch": 0.8433404486208664, + "grad_norm": 0.1493886262178421, + "learning_rate": 0.001, + "loss": 2.118, + "num_input_tokens_seen": 46339548736, + "step": 88400 + }, + { + "epoch": 0.8438174511370548, + "grad_norm": 0.1382344514131546, + "learning_rate": 0.001, + "loss": 2.1152, + "num_input_tokens_seen": 46365752704, + "step": 88450 + }, + { + "epoch": 0.8442944536532431, + "grad_norm": 0.15339982509613037, + "learning_rate": 0.001, + "loss": 2.1191, + "num_input_tokens_seen": 46391967104, + "step": 88500 + }, + { + "epoch": 0.8442944536532431, + "eval_loss": 2.0305092334747314, + "eval_runtime": 79.8949, + "eval_samples_per_second": 62.582, + "eval_steps_per_second": 15.646, + "num_input_tokens_seen": 46391967104, + "step": 88500 + }, + { + "epoch": 0.8447714561694313, + "grad_norm": 0.1537119597196579, + "learning_rate": 0.001, + "loss": 2.1037, + "num_input_tokens_seen": 46418173888, + "step": 88550 + }, + { + "epoch": 0.8452484586856196, + "grad_norm": 0.14273403584957123, + "learning_rate": 0.001, + "loss": 2.1112, + "num_input_tokens_seen": 46444386848, + "step": 88600 + }, + { + "epoch": 0.8457254612018078, + "grad_norm": 0.13516731560230255, + "learning_rate": 0.001, + "loss": 2.1163, + "num_input_tokens_seen": 46470594016, + "step": 88650 + }, + { + "epoch": 0.8462024637179961, + "grad_norm": 0.1436593383550644, + "learning_rate": 0.001, + "loss": 2.1127, + "num_input_tokens_seen": 46496808416, + "step": 88700 + }, + { + "epoch": 0.8466794662341843, + "grad_norm": 0.14031122624874115, + "learning_rate": 0.001, + "loss": 2.1108, + "num_input_tokens_seen": 46523019488, + "step": 88750 + }, + { + "epoch": 0.8471564687503726, + "grad_norm": 0.15727658569812775, + "learning_rate": 0.001, + "loss": 2.1129, + "num_input_tokens_seen": 46549230464, + "step": 88800 + }, + { + "epoch": 0.847633471266561, + "grad_norm": 0.13983725011348724, + "learning_rate": 0.001, + "loss": 2.1146, + "num_input_tokens_seen": 46575440160, + "step": 88850 + }, + { + "epoch": 0.8481104737827492, + "grad_norm": 0.14959338307380676, + "learning_rate": 0.001, + "loss": 2.1149, + "num_input_tokens_seen": 46601647200, + "step": 88900 + }, + { + "epoch": 0.8485874762989375, + "grad_norm": 0.1365756243467331, + "learning_rate": 0.001, + "loss": 2.1123, + "num_input_tokens_seen": 46627859680, + "step": 88950 + }, + { + "epoch": 0.8490644788151257, + "grad_norm": 0.14246727526187897, + "learning_rate": 0.001, + "loss": 2.1075, + "num_input_tokens_seen": 46654069856, + "step": 89000 + }, + { + "epoch": 0.8490644788151257, + "eval_loss": 2.0287179946899414, + "eval_runtime": 80.2662, + "eval_samples_per_second": 62.293, + "eval_steps_per_second": 15.573, + "num_input_tokens_seen": 46654069856, + "step": 89000 + }, + { + "epoch": 0.849541481331314, + "grad_norm": 0.15061074495315552, + "learning_rate": 0.001, + "loss": 2.1186, + "num_input_tokens_seen": 46680279040, + "step": 89050 + }, + { + "epoch": 0.8500184838475023, + "grad_norm": 0.16355903446674347, + "learning_rate": 0.001, + "loss": 2.1159, + "num_input_tokens_seen": 46706491232, + "step": 89100 + }, + { + "epoch": 0.8504954863636905, + "grad_norm": 0.14321939647197723, + "learning_rate": 0.001, + "loss": 2.115, + "num_input_tokens_seen": 46732690880, + "step": 89150 + }, + { + "epoch": 0.8509724888798789, + "grad_norm": 0.15475858747959137, + "learning_rate": 0.001, + "loss": 2.1201, + "num_input_tokens_seen": 46758901536, + "step": 89200 + }, + { + "epoch": 0.8514494913960671, + "grad_norm": 0.14698758721351624, + "learning_rate": 0.001, + "loss": 2.1247, + "num_input_tokens_seen": 46785111584, + "step": 89250 + }, + { + "epoch": 0.8519264939122554, + "grad_norm": 0.13974907994270325, + "learning_rate": 0.001, + "loss": 2.1172, + "num_input_tokens_seen": 46811325984, + "step": 89300 + }, + { + "epoch": 0.8524034964284437, + "grad_norm": 0.1385921984910965, + "learning_rate": 0.001, + "loss": 2.1104, + "num_input_tokens_seen": 46837523616, + "step": 89350 + }, + { + "epoch": 0.8528804989446319, + "grad_norm": 0.1460406333208084, + "learning_rate": 0.001, + "loss": 2.1144, + "num_input_tokens_seen": 46863725792, + "step": 89400 + }, + { + "epoch": 0.8533575014608202, + "grad_norm": 0.1514638215303421, + "learning_rate": 0.001, + "loss": 2.1061, + "num_input_tokens_seen": 46889930432, + "step": 89450 + }, + { + "epoch": 0.8538345039770084, + "grad_norm": 0.15317507088184357, + "learning_rate": 0.001, + "loss": 2.1109, + "num_input_tokens_seen": 46916134592, + "step": 89500 + }, + { + "epoch": 0.8538345039770084, + "eval_loss": 2.0291860103607178, + "eval_runtime": 79.9263, + "eval_samples_per_second": 62.558, + "eval_steps_per_second": 15.639, + "num_input_tokens_seen": 46916134592, + "step": 89500 + }, + { + "epoch": 0.8543115064931968, + "grad_norm": 0.14707760512828827, + "learning_rate": 0.001, + "loss": 2.1161, + "num_input_tokens_seen": 46942348992, + "step": 89550 + }, + { + "epoch": 0.8547885090093851, + "grad_norm": 0.13737894594669342, + "learning_rate": 0.001, + "loss": 2.106, + "num_input_tokens_seen": 46968561760, + "step": 89600 + }, + { + "epoch": 0.8552655115255733, + "grad_norm": 0.13339094817638397, + "learning_rate": 0.001, + "loss": 2.1091, + "num_input_tokens_seen": 46994765888, + "step": 89650 + }, + { + "epoch": 0.8557425140417616, + "grad_norm": 0.14085884392261505, + "learning_rate": 0.001, + "loss": 2.1081, + "num_input_tokens_seen": 47020978208, + "step": 89700 + }, + { + "epoch": 0.8562195165579498, + "grad_norm": 0.13842567801475525, + "learning_rate": 0.001, + "loss": 2.1173, + "num_input_tokens_seen": 47047190592, + "step": 89750 + }, + { + "epoch": 0.8566965190741381, + "grad_norm": 0.13960140943527222, + "learning_rate": 0.001, + "loss": 2.1069, + "num_input_tokens_seen": 47073403584, + "step": 89800 + }, + { + "epoch": 0.8571735215903263, + "grad_norm": 0.1304618865251541, + "learning_rate": 0.001, + "loss": 2.1108, + "num_input_tokens_seen": 47099616384, + "step": 89850 + }, + { + "epoch": 0.8576505241065147, + "grad_norm": 0.13719524443149567, + "learning_rate": 0.001, + "loss": 2.1156, + "num_input_tokens_seen": 47125823008, + "step": 89900 + }, + { + "epoch": 0.858127526622703, + "grad_norm": 0.1370345801115036, + "learning_rate": 0.001, + "loss": 2.1146, + "num_input_tokens_seen": 47152034368, + "step": 89950 + }, + { + "epoch": 0.8586045291388912, + "grad_norm": 0.13640981912612915, + "learning_rate": 0.001, + "loss": 2.1208, + "num_input_tokens_seen": 47178248768, + "step": 90000 + }, + { + "epoch": 0.8586045291388912, + "eval_loss": 2.0290136337280273, + "eval_runtime": 79.9812, + "eval_samples_per_second": 62.515, + "eval_steps_per_second": 15.629, + "num_input_tokens_seen": 47178248768, + "step": 90000 + }, + { + "epoch": 0.8590815316550795, + "grad_norm": 0.1481214165687561, + "learning_rate": 0.001, + "loss": 2.1111, + "num_input_tokens_seen": 47204461440, + "step": 90050 + }, + { + "epoch": 0.8595585341712677, + "grad_norm": 0.1385306715965271, + "learning_rate": 0.001, + "loss": 2.1111, + "num_input_tokens_seen": 47230673824, + "step": 90100 + }, + { + "epoch": 0.860035536687456, + "grad_norm": 0.15070556104183197, + "learning_rate": 0.001, + "loss": 2.1117, + "num_input_tokens_seen": 47256883968, + "step": 90150 + }, + { + "epoch": 0.8605125392036443, + "grad_norm": 0.14528563618659973, + "learning_rate": 0.001, + "loss": 2.1095, + "num_input_tokens_seen": 47283091808, + "step": 90200 + }, + { + "epoch": 0.8609895417198326, + "grad_norm": 0.13799038529396057, + "learning_rate": 0.001, + "loss": 2.108, + "num_input_tokens_seen": 47309298976, + "step": 90250 + }, + { + "epoch": 0.8614665442360209, + "grad_norm": 0.14631977677345276, + "learning_rate": 0.001, + "loss": 2.1033, + "num_input_tokens_seen": 47335513376, + "step": 90300 + }, + { + "epoch": 0.8619435467522091, + "grad_norm": 0.14224396646022797, + "learning_rate": 0.001, + "loss": 2.1157, + "num_input_tokens_seen": 47361721504, + "step": 90350 + }, + { + "epoch": 0.8624205492683974, + "grad_norm": 0.14005549252033234, + "learning_rate": 0.001, + "loss": 2.1103, + "num_input_tokens_seen": 47387932000, + "step": 90400 + }, + { + "epoch": 0.8628975517845857, + "grad_norm": 0.13566839694976807, + "learning_rate": 0.001, + "loss": 2.1042, + "num_input_tokens_seen": 47414144928, + "step": 90450 + }, + { + "epoch": 0.8633745543007739, + "grad_norm": 0.14136169850826263, + "learning_rate": 0.001, + "loss": 2.1056, + "num_input_tokens_seen": 47440355904, + "step": 90500 + }, + { + "epoch": 0.8633745543007739, + "eval_loss": 2.028252124786377, + "eval_runtime": 79.9732, + "eval_samples_per_second": 62.521, + "eval_steps_per_second": 15.63, + "num_input_tokens_seen": 47440355904, + "step": 90500 + }, + { + "epoch": 0.8638515568169622, + "grad_norm": 0.13514867424964905, + "learning_rate": 0.001, + "loss": 2.1127, + "num_input_tokens_seen": 47466570304, + "step": 90550 + }, + { + "epoch": 0.8643285593331504, + "grad_norm": 0.13685061037540436, + "learning_rate": 0.001, + "loss": 2.1115, + "num_input_tokens_seen": 47492780416, + "step": 90600 + }, + { + "epoch": 0.8648055618493388, + "grad_norm": 0.13860297203063965, + "learning_rate": 0.001, + "loss": 2.1125, + "num_input_tokens_seen": 47518992000, + "step": 90650 + }, + { + "epoch": 0.865282564365527, + "grad_norm": 0.13972771167755127, + "learning_rate": 0.001, + "loss": 2.1231, + "num_input_tokens_seen": 47545201600, + "step": 90700 + }, + { + "epoch": 0.8657595668817153, + "grad_norm": 0.14439938962459564, + "learning_rate": 0.001, + "loss": 2.1118, + "num_input_tokens_seen": 47571409888, + "step": 90750 + }, + { + "epoch": 0.8662365693979036, + "grad_norm": 0.1624486893415451, + "learning_rate": 0.001, + "loss": 2.1081, + "num_input_tokens_seen": 47597620352, + "step": 90800 + }, + { + "epoch": 0.8667135719140918, + "grad_norm": 0.1499566286802292, + "learning_rate": 0.001, + "loss": 2.1115, + "num_input_tokens_seen": 47623825344, + "step": 90850 + }, + { + "epoch": 0.8671905744302801, + "grad_norm": 0.14615119993686676, + "learning_rate": 0.001, + "loss": 2.1223, + "num_input_tokens_seen": 47650039392, + "step": 90900 + }, + { + "epoch": 0.8676675769464683, + "grad_norm": 0.14285366237163544, + "learning_rate": 0.001, + "loss": 2.1164, + "num_input_tokens_seen": 47676251808, + "step": 90950 + }, + { + "epoch": 0.8681445794626567, + "grad_norm": 0.13764303922653198, + "learning_rate": 0.001, + "loss": 2.1071, + "num_input_tokens_seen": 47702460032, + "step": 91000 + }, + { + "epoch": 0.8681445794626567, + "eval_loss": 2.0275421142578125, + "eval_runtime": 80.4452, + "eval_samples_per_second": 62.154, + "eval_steps_per_second": 15.539, + "num_input_tokens_seen": 47702460032, + "step": 91000 + }, + { + "epoch": 0.868621581978845, + "grad_norm": 0.146665558218956, + "learning_rate": 0.001, + "loss": 2.1203, + "num_input_tokens_seen": 47728669632, + "step": 91050 + }, + { + "epoch": 0.8690985844950332, + "grad_norm": 0.15256023406982422, + "learning_rate": 0.001, + "loss": 2.1144, + "num_input_tokens_seen": 47754867168, + "step": 91100 + }, + { + "epoch": 0.8695755870112215, + "grad_norm": 0.14853611588478088, + "learning_rate": 0.001, + "loss": 2.1136, + "num_input_tokens_seen": 47781081056, + "step": 91150 + }, + { + "epoch": 0.8700525895274097, + "grad_norm": 0.16396841406822205, + "learning_rate": 0.001, + "loss": 2.1116, + "num_input_tokens_seen": 47807287712, + "step": 91200 + }, + { + "epoch": 0.870529592043598, + "grad_norm": 0.15022516250610352, + "learning_rate": 0.001, + "loss": 2.1237, + "num_input_tokens_seen": 47833489472, + "step": 91250 + }, + { + "epoch": 0.8710065945597864, + "grad_norm": 0.1452953815460205, + "learning_rate": 0.001, + "loss": 2.1158, + "num_input_tokens_seen": 47859697664, + "step": 91300 + }, + { + "epoch": 0.8714835970759746, + "grad_norm": 0.14615213871002197, + "learning_rate": 0.001, + "loss": 2.1081, + "num_input_tokens_seen": 47885910336, + "step": 91350 + }, + { + "epoch": 0.8719605995921629, + "grad_norm": 0.14509530365467072, + "learning_rate": 0.001, + "loss": 2.1182, + "num_input_tokens_seen": 47912122848, + "step": 91400 + }, + { + "epoch": 0.8724376021083511, + "grad_norm": 0.14017629623413086, + "learning_rate": 0.001, + "loss": 2.1148, + "num_input_tokens_seen": 47938323776, + "step": 91450 + }, + { + "epoch": 0.8729146046245394, + "grad_norm": 0.14566557109355927, + "learning_rate": 0.001, + "loss": 2.1069, + "num_input_tokens_seen": 47964538176, + "step": 91500 + }, + { + "epoch": 0.8729146046245394, + "eval_loss": 2.027026414871216, + "eval_runtime": 79.9343, + "eval_samples_per_second": 62.551, + "eval_steps_per_second": 15.638, + "num_input_tokens_seen": 47964538176, + "step": 91500 + }, + { + "epoch": 0.8733916071407277, + "grad_norm": 0.1404925286769867, + "learning_rate": 0.001, + "loss": 2.1115, + "num_input_tokens_seen": 47990748160, + "step": 91550 + }, + { + "epoch": 0.8738686096569159, + "grad_norm": 0.14250437915325165, + "learning_rate": 0.001, + "loss": 2.1123, + "num_input_tokens_seen": 48016960768, + "step": 91600 + }, + { + "epoch": 0.8743456121731042, + "grad_norm": 0.14528650045394897, + "learning_rate": 0.001, + "loss": 2.1101, + "num_input_tokens_seen": 48043175072, + "step": 91650 + }, + { + "epoch": 0.8748226146892925, + "grad_norm": 0.13685448467731476, + "learning_rate": 0.001, + "loss": 2.1151, + "num_input_tokens_seen": 48069383456, + "step": 91700 + }, + { + "epoch": 0.8752996172054808, + "grad_norm": 0.14179499447345734, + "learning_rate": 0.001, + "loss": 2.1015, + "num_input_tokens_seen": 48095593984, + "step": 91750 + }, + { + "epoch": 0.875776619721669, + "grad_norm": 0.1447928100824356, + "learning_rate": 0.001, + "loss": 2.1114, + "num_input_tokens_seen": 48121802368, + "step": 91800 + }, + { + "epoch": 0.8762536222378573, + "grad_norm": 0.13791429996490479, + "learning_rate": 0.001, + "loss": 2.1099, + "num_input_tokens_seen": 48148014048, + "step": 91850 + }, + { + "epoch": 0.8767306247540456, + "grad_norm": 0.16552382707595825, + "learning_rate": 0.001, + "loss": 2.1114, + "num_input_tokens_seen": 48174228448, + "step": 91900 + }, + { + "epoch": 0.8772076272702338, + "grad_norm": 0.14140479266643524, + "learning_rate": 0.001, + "loss": 2.1097, + "num_input_tokens_seen": 48200439552, + "step": 91950 + }, + { + "epoch": 0.8776846297864221, + "grad_norm": 0.14821244776248932, + "learning_rate": 0.001, + "loss": 2.1003, + "num_input_tokens_seen": 48226649920, + "step": 92000 + }, + { + "epoch": 0.8776846297864221, + "eval_loss": 2.0305159091949463, + "eval_runtime": 79.559, + "eval_samples_per_second": 62.846, + "eval_steps_per_second": 15.712, + "num_input_tokens_seen": 48226649920, + "step": 92000 + }, + { + "epoch": 0.8781616323026104, + "grad_norm": 0.15452982485294342, + "learning_rate": 0.001, + "loss": 2.1059, + "num_input_tokens_seen": 48252864320, + "step": 92050 + }, + { + "epoch": 0.8786386348187987, + "grad_norm": 0.13859480619430542, + "learning_rate": 0.001, + "loss": 2.1111, + "num_input_tokens_seen": 48279076256, + "step": 92100 + }, + { + "epoch": 0.879115637334987, + "grad_norm": 0.13759450614452362, + "learning_rate": 0.001, + "loss": 2.1025, + "num_input_tokens_seen": 48305290432, + "step": 92150 + }, + { + "epoch": 0.8795926398511752, + "grad_norm": 0.14123345911502838, + "learning_rate": 0.001, + "loss": 2.1072, + "num_input_tokens_seen": 48331503296, + "step": 92200 + }, + { + "epoch": 0.8800696423673635, + "grad_norm": 0.14411857724189758, + "learning_rate": 0.001, + "loss": 2.1119, + "num_input_tokens_seen": 48357715264, + "step": 92250 + }, + { + "epoch": 0.8805466448835517, + "grad_norm": 0.14408078789710999, + "learning_rate": 0.001, + "loss": 2.1135, + "num_input_tokens_seen": 48383927936, + "step": 92300 + }, + { + "epoch": 0.88102364739974, + "grad_norm": 0.1592986285686493, + "learning_rate": 0.001, + "loss": 2.1052, + "num_input_tokens_seen": 48410142336, + "step": 92350 + }, + { + "epoch": 0.8815006499159284, + "grad_norm": 0.15246793627738953, + "learning_rate": 0.001, + "loss": 2.1082, + "num_input_tokens_seen": 48436356736, + "step": 92400 + }, + { + "epoch": 0.8819776524321166, + "grad_norm": 0.14505651593208313, + "learning_rate": 0.001, + "loss": 2.1158, + "num_input_tokens_seen": 48462565600, + "step": 92450 + }, + { + "epoch": 0.8824546549483049, + "grad_norm": 0.13918310403823853, + "learning_rate": 0.001, + "loss": 2.102, + "num_input_tokens_seen": 48488770560, + "step": 92500 + }, + { + "epoch": 0.8824546549483049, + "eval_loss": 2.0255165100097656, + "eval_runtime": 80.9026, + "eval_samples_per_second": 61.803, + "eval_steps_per_second": 15.451, + "num_input_tokens_seen": 48488770560, + "step": 92500 + }, + { + "epoch": 0.8829316574644931, + "grad_norm": 0.14701320230960846, + "learning_rate": 0.001, + "loss": 2.1127, + "num_input_tokens_seen": 48514976384, + "step": 92550 + }, + { + "epoch": 0.8834086599806814, + "grad_norm": 0.14102083444595337, + "learning_rate": 0.001, + "loss": 2.1091, + "num_input_tokens_seen": 48541187392, + "step": 92600 + }, + { + "epoch": 0.8838856624968696, + "grad_norm": 0.14818648993968964, + "learning_rate": 0.001, + "loss": 2.1097, + "num_input_tokens_seen": 48567390624, + "step": 92650 + }, + { + "epoch": 0.8843626650130579, + "grad_norm": 0.15168142318725586, + "learning_rate": 0.001, + "loss": 2.125, + "num_input_tokens_seen": 48593598688, + "step": 92700 + }, + { + "epoch": 0.8848396675292463, + "grad_norm": 0.14235848188400269, + "learning_rate": 0.001, + "loss": 2.1114, + "num_input_tokens_seen": 48619802048, + "step": 92750 + }, + { + "epoch": 0.8853166700454345, + "grad_norm": 0.15503767132759094, + "learning_rate": 0.001, + "loss": 2.1129, + "num_input_tokens_seen": 48646006848, + "step": 92800 + }, + { + "epoch": 0.8857936725616228, + "grad_norm": 0.13856704533100128, + "learning_rate": 0.001, + "loss": 2.1055, + "num_input_tokens_seen": 48672218720, + "step": 92850 + }, + { + "epoch": 0.886270675077811, + "grad_norm": 0.14264176785945892, + "learning_rate": 0.001, + "loss": 2.1163, + "num_input_tokens_seen": 48698417760, + "step": 92900 + }, + { + "epoch": 0.8867476775939993, + "grad_norm": 0.13919401168823242, + "learning_rate": 0.001, + "loss": 2.1061, + "num_input_tokens_seen": 48724617440, + "step": 92950 + }, + { + "epoch": 0.8872246801101876, + "grad_norm": 0.1560058891773224, + "learning_rate": 0.001, + "loss": 2.1141, + "num_input_tokens_seen": 48750829152, + "step": 93000 + }, + { + "epoch": 0.8872246801101876, + "eval_loss": 2.0258500576019287, + "eval_runtime": 80.6659, + "eval_samples_per_second": 61.984, + "eval_steps_per_second": 15.496, + "num_input_tokens_seen": 48750829152, + "step": 93000 + }, + { + "epoch": 0.8877016826263758, + "grad_norm": 0.13532903790473938, + "learning_rate": 0.001, + "loss": 2.11, + "num_input_tokens_seen": 48777041568, + "step": 93050 + }, + { + "epoch": 0.8881786851425642, + "grad_norm": 0.15079811215400696, + "learning_rate": 0.001, + "loss": 2.1125, + "num_input_tokens_seen": 48803252992, + "step": 93100 + }, + { + "epoch": 0.8886556876587524, + "grad_norm": 0.14035262167453766, + "learning_rate": 0.001, + "loss": 2.1126, + "num_input_tokens_seen": 48829457344, + "step": 93150 + }, + { + "epoch": 0.8891326901749407, + "grad_norm": 0.14490865170955658, + "learning_rate": 0.001, + "loss": 2.1058, + "num_input_tokens_seen": 48855661760, + "step": 93200 + }, + { + "epoch": 0.889609692691129, + "grad_norm": 0.14975398778915405, + "learning_rate": 0.001, + "loss": 2.1079, + "num_input_tokens_seen": 48881871328, + "step": 93250 + }, + { + "epoch": 0.8900866952073172, + "grad_norm": 0.14744842052459717, + "learning_rate": 0.001, + "loss": 2.1128, + "num_input_tokens_seen": 48908085728, + "step": 93300 + }, + { + "epoch": 0.8905636977235055, + "grad_norm": 0.14297208189964294, + "learning_rate": 0.001, + "loss": 2.1188, + "num_input_tokens_seen": 48934300128, + "step": 93350 + }, + { + "epoch": 0.8910407002396937, + "grad_norm": 0.1417332887649536, + "learning_rate": 0.001, + "loss": 2.1138, + "num_input_tokens_seen": 48960513184, + "step": 93400 + }, + { + "epoch": 0.891517702755882, + "grad_norm": 0.1589946150779724, + "learning_rate": 0.001, + "loss": 2.1187, + "num_input_tokens_seen": 48986726080, + "step": 93450 + }, + { + "epoch": 0.8919947052720704, + "grad_norm": 0.14446181058883667, + "learning_rate": 0.001, + "loss": 2.1241, + "num_input_tokens_seen": 49012940480, + "step": 93500 + }, + { + "epoch": 0.8919947052720704, + "eval_loss": 2.0254712104797363, + "eval_runtime": 79.7436, + "eval_samples_per_second": 62.701, + "eval_steps_per_second": 15.675, + "num_input_tokens_seen": 49012940480, + "step": 93500 + }, + { + "epoch": 0.8924717077882586, + "grad_norm": 0.15279428660869598, + "learning_rate": 0.001, + "loss": 2.1113, + "num_input_tokens_seen": 49039153024, + "step": 93550 + }, + { + "epoch": 0.8929487103044469, + "grad_norm": 0.1449560672044754, + "learning_rate": 0.001, + "loss": 2.1117, + "num_input_tokens_seen": 49065367424, + "step": 93600 + }, + { + "epoch": 0.8934257128206351, + "grad_norm": 0.13819773495197296, + "learning_rate": 0.001, + "loss": 2.0989, + "num_input_tokens_seen": 49091579424, + "step": 93650 + }, + { + "epoch": 0.8939027153368234, + "grad_norm": 0.13857555389404297, + "learning_rate": 0.001, + "loss": 2.1183, + "num_input_tokens_seen": 49117788864, + "step": 93700 + }, + { + "epoch": 0.8943797178530116, + "grad_norm": 0.14007195830345154, + "learning_rate": 0.001, + "loss": 2.1133, + "num_input_tokens_seen": 49143999488, + "step": 93750 + }, + { + "epoch": 0.8948567203691999, + "grad_norm": 0.13965079188346863, + "learning_rate": 0.001, + "loss": 2.1097, + "num_input_tokens_seen": 49170200000, + "step": 93800 + }, + { + "epoch": 0.8953337228853883, + "grad_norm": 0.1414870172739029, + "learning_rate": 0.001, + "loss": 2.1164, + "num_input_tokens_seen": 49196410944, + "step": 93850 + }, + { + "epoch": 0.8958107254015765, + "grad_norm": 0.15663990378379822, + "learning_rate": 0.001, + "loss": 2.1086, + "num_input_tokens_seen": 49222620992, + "step": 93900 + }, + { + "epoch": 0.8962877279177648, + "grad_norm": 0.15661224722862244, + "learning_rate": 0.001, + "loss": 2.1091, + "num_input_tokens_seen": 49248834304, + "step": 93950 + }, + { + "epoch": 0.896764730433953, + "grad_norm": 0.1401936262845993, + "learning_rate": 0.001, + "loss": 2.1154, + "num_input_tokens_seen": 49275046560, + "step": 94000 + }, + { + "epoch": 0.896764730433953, + "eval_loss": 2.022953510284424, + "eval_runtime": 80.2932, + "eval_samples_per_second": 62.272, + "eval_steps_per_second": 15.568, + "num_input_tokens_seen": 49275046560, + "step": 94000 + }, + { + "epoch": 0.8972417329501413, + "grad_norm": 0.1521204262971878, + "learning_rate": 0.001, + "loss": 2.1044, + "num_input_tokens_seen": 49301242560, + "step": 94050 + }, + { + "epoch": 0.8977187354663296, + "grad_norm": 0.1409813016653061, + "learning_rate": 0.001, + "loss": 2.1165, + "num_input_tokens_seen": 49327452352, + "step": 94100 + }, + { + "epoch": 0.8981957379825178, + "grad_norm": 0.1482156664133072, + "learning_rate": 0.001, + "loss": 2.1056, + "num_input_tokens_seen": 49353664096, + "step": 94150 + }, + { + "epoch": 0.8986727404987062, + "grad_norm": 0.15649978816509247, + "learning_rate": 0.001, + "loss": 2.1015, + "num_input_tokens_seen": 49379874368, + "step": 94200 + }, + { + "epoch": 0.8991497430148944, + "grad_norm": 0.1503802388906479, + "learning_rate": 0.001, + "loss": 2.1087, + "num_input_tokens_seen": 49406084320, + "step": 94250 + }, + { + "epoch": 0.8996267455310827, + "grad_norm": 0.1439296454191208, + "learning_rate": 0.001, + "loss": 2.1153, + "num_input_tokens_seen": 49432296992, + "step": 94300 + }, + { + "epoch": 0.900103748047271, + "grad_norm": 0.14262431859970093, + "learning_rate": 0.001, + "loss": 2.1128, + "num_input_tokens_seen": 49458508640, + "step": 94350 + }, + { + "epoch": 0.9005807505634592, + "grad_norm": 0.14529536664485931, + "learning_rate": 0.001, + "loss": 2.1072, + "num_input_tokens_seen": 49484719072, + "step": 94400 + }, + { + "epoch": 0.9010577530796475, + "grad_norm": 0.1515061855316162, + "learning_rate": 0.001, + "loss": 2.1033, + "num_input_tokens_seen": 49510932320, + "step": 94450 + }, + { + "epoch": 0.9015347555958357, + "grad_norm": 0.14318937063217163, + "learning_rate": 0.001, + "loss": 2.0967, + "num_input_tokens_seen": 49537142912, + "step": 94500 + }, + { + "epoch": 0.9015347555958357, + "eval_loss": 2.023240327835083, + "eval_runtime": 80.5534, + "eval_samples_per_second": 62.071, + "eval_steps_per_second": 15.518, + "num_input_tokens_seen": 49537142912, + "step": 94500 + }, + { + "epoch": 0.902011758112024, + "grad_norm": 0.14540798962116241, + "learning_rate": 0.001, + "loss": 2.1117, + "num_input_tokens_seen": 49563352672, + "step": 94550 + }, + { + "epoch": 0.9024887606282123, + "grad_norm": 0.15588590502738953, + "learning_rate": 0.001, + "loss": 2.1148, + "num_input_tokens_seen": 49589555168, + "step": 94600 + }, + { + "epoch": 0.9029657631444006, + "grad_norm": 0.14033040404319763, + "learning_rate": 0.001, + "loss": 2.1072, + "num_input_tokens_seen": 49615768512, + "step": 94650 + }, + { + "epoch": 0.9034427656605889, + "grad_norm": 0.1453922539949417, + "learning_rate": 0.001, + "loss": 2.1151, + "num_input_tokens_seen": 49641970368, + "step": 94700 + }, + { + "epoch": 0.9039197681767771, + "grad_norm": 0.14980725944042206, + "learning_rate": 0.001, + "loss": 2.1093, + "num_input_tokens_seen": 49668181024, + "step": 94750 + }, + { + "epoch": 0.9043967706929654, + "grad_norm": 0.14013737440109253, + "learning_rate": 0.001, + "loss": 2.1214, + "num_input_tokens_seen": 49694392704, + "step": 94800 + }, + { + "epoch": 0.9048737732091536, + "grad_norm": 0.13809648156166077, + "learning_rate": 0.001, + "loss": 2.1151, + "num_input_tokens_seen": 49720606848, + "step": 94850 + }, + { + "epoch": 0.905350775725342, + "grad_norm": 0.13267497718334198, + "learning_rate": 0.001, + "loss": 2.1058, + "num_input_tokens_seen": 49746811360, + "step": 94900 + }, + { + "epoch": 0.9058277782415303, + "grad_norm": 0.1532643884420395, + "learning_rate": 0.001, + "loss": 2.1101, + "num_input_tokens_seen": 49773025760, + "step": 94950 + }, + { + "epoch": 0.9063047807577185, + "grad_norm": 0.13915950059890747, + "learning_rate": 0.001, + "loss": 2.1101, + "num_input_tokens_seen": 49799231680, + "step": 95000 + }, + { + "epoch": 0.9063047807577185, + "eval_loss": 2.024012804031372, + "eval_runtime": 80.2612, + "eval_samples_per_second": 62.297, + "eval_steps_per_second": 15.574, + "num_input_tokens_seen": 49799231680, + "step": 95000 + }, + { + "epoch": 0.9067817832739068, + "grad_norm": 0.15056970715522766, + "learning_rate": 0.001, + "loss": 2.1047, + "num_input_tokens_seen": 49825444800, + "step": 95050 + }, + { + "epoch": 0.907258785790095, + "grad_norm": 0.13517211377620697, + "learning_rate": 0.001, + "loss": 2.0979, + "num_input_tokens_seen": 49851655808, + "step": 95100 + }, + { + "epoch": 0.9077357883062833, + "grad_norm": 0.13956350088119507, + "learning_rate": 0.001, + "loss": 2.1086, + "num_input_tokens_seen": 49877867392, + "step": 95150 + }, + { + "epoch": 0.9082127908224716, + "grad_norm": 0.1523425281047821, + "learning_rate": 0.001, + "loss": 2.0936, + "num_input_tokens_seen": 49904076064, + "step": 95200 + }, + { + "epoch": 0.9086897933386598, + "grad_norm": 0.15285497903823853, + "learning_rate": 0.001, + "loss": 2.1048, + "num_input_tokens_seen": 49930288160, + "step": 95250 + }, + { + "epoch": 0.9091667958548482, + "grad_norm": 0.14413221180438995, + "learning_rate": 0.001, + "loss": 2.0946, + "num_input_tokens_seen": 49956501184, + "step": 95300 + }, + { + "epoch": 0.9096437983710364, + "grad_norm": 0.1461506485939026, + "learning_rate": 0.001, + "loss": 2.1084, + "num_input_tokens_seen": 49982711360, + "step": 95350 + }, + { + "epoch": 0.9101208008872247, + "grad_norm": 0.13794639706611633, + "learning_rate": 0.001, + "loss": 2.1014, + "num_input_tokens_seen": 50008921440, + "step": 95400 + }, + { + "epoch": 0.9105978034034129, + "grad_norm": 0.14720895886421204, + "learning_rate": 0.001, + "loss": 2.1017, + "num_input_tokens_seen": 50035120864, + "step": 95450 + }, + { + "epoch": 0.9110748059196012, + "grad_norm": 0.14016789197921753, + "learning_rate": 0.001, + "loss": 2.1087, + "num_input_tokens_seen": 50061331936, + "step": 95500 + }, + { + "epoch": 0.9110748059196012, + "eval_loss": 2.0230298042297363, + "eval_runtime": 79.7075, + "eval_samples_per_second": 62.729, + "eval_steps_per_second": 15.682, + "num_input_tokens_seen": 50061331936, + "step": 95500 + }, + { + "epoch": 0.9115518084357895, + "grad_norm": 0.15129534900188446, + "learning_rate": 0.001, + "loss": 2.1033, + "num_input_tokens_seen": 50087536032, + "step": 95550 + }, + { + "epoch": 0.9120288109519777, + "grad_norm": 0.14662089943885803, + "learning_rate": 0.001, + "loss": 2.1099, + "num_input_tokens_seen": 50113747872, + "step": 95600 + }, + { + "epoch": 0.9125058134681661, + "grad_norm": 0.15536580979824066, + "learning_rate": 0.001, + "loss": 2.1292, + "num_input_tokens_seen": 50139961728, + "step": 95650 + }, + { + "epoch": 0.9129828159843543, + "grad_norm": 0.14354456961154938, + "learning_rate": 0.001, + "loss": 2.105, + "num_input_tokens_seen": 50166173152, + "step": 95700 + }, + { + "epoch": 0.9134598185005426, + "grad_norm": 0.15019798278808594, + "learning_rate": 0.001, + "loss": 2.1131, + "num_input_tokens_seen": 50192384928, + "step": 95750 + }, + { + "epoch": 0.9139368210167309, + "grad_norm": 0.13612700998783112, + "learning_rate": 0.001, + "loss": 2.1122, + "num_input_tokens_seen": 50218597600, + "step": 95800 + }, + { + "epoch": 0.9144138235329191, + "grad_norm": 0.1439824402332306, + "learning_rate": 0.001, + "loss": 2.1018, + "num_input_tokens_seen": 50244810528, + "step": 95850 + }, + { + "epoch": 0.9148908260491074, + "grad_norm": 0.15556299686431885, + "learning_rate": 0.001, + "loss": 2.1064, + "num_input_tokens_seen": 50271019008, + "step": 95900 + }, + { + "epoch": 0.9153678285652956, + "grad_norm": 0.1479777693748474, + "learning_rate": 0.001, + "loss": 2.1111, + "num_input_tokens_seen": 50297228896, + "step": 95950 + }, + { + "epoch": 0.915844831081484, + "grad_norm": 0.14080928266048431, + "learning_rate": 0.001, + "loss": 2.1011, + "num_input_tokens_seen": 50323443136, + "step": 96000 + }, + { + "epoch": 0.915844831081484, + "eval_loss": 2.022911310195923, + "eval_runtime": 79.6192, + "eval_samples_per_second": 62.799, + "eval_steps_per_second": 15.7, + "num_input_tokens_seen": 50323443136, + "step": 96000 + }, + { + "epoch": 0.9163218335976723, + "grad_norm": 0.14819885790348053, + "learning_rate": 0.001, + "loss": 2.1138, + "num_input_tokens_seen": 50349653824, + "step": 96050 + }, + { + "epoch": 0.9167988361138605, + "grad_norm": 0.1338687390089035, + "learning_rate": 0.001, + "loss": 2.1068, + "num_input_tokens_seen": 50375857728, + "step": 96100 + }, + { + "epoch": 0.9172758386300488, + "grad_norm": 0.14282946288585663, + "learning_rate": 0.001, + "loss": 2.1155, + "num_input_tokens_seen": 50402064096, + "step": 96150 + }, + { + "epoch": 0.917752841146237, + "grad_norm": 0.137980654835701, + "learning_rate": 0.001, + "loss": 2.0963, + "num_input_tokens_seen": 50428274592, + "step": 96200 + }, + { + "epoch": 0.9182298436624253, + "grad_norm": 0.1530529260635376, + "learning_rate": 0.001, + "loss": 2.1074, + "num_input_tokens_seen": 50454485120, + "step": 96250 + }, + { + "epoch": 0.9187068461786136, + "grad_norm": 0.15306447446346283, + "learning_rate": 0.001, + "loss": 2.1107, + "num_input_tokens_seen": 50480694976, + "step": 96300 + }, + { + "epoch": 0.9191838486948019, + "grad_norm": 0.13567544519901276, + "learning_rate": 0.001, + "loss": 2.1087, + "num_input_tokens_seen": 50506903392, + "step": 96350 + }, + { + "epoch": 0.9196608512109902, + "grad_norm": 0.14647279679775238, + "learning_rate": 0.001, + "loss": 2.1087, + "num_input_tokens_seen": 50533115264, + "step": 96400 + }, + { + "epoch": 0.9201378537271784, + "grad_norm": 0.14072103798389435, + "learning_rate": 0.001, + "loss": 2.1003, + "num_input_tokens_seen": 50559324224, + "step": 96450 + }, + { + "epoch": 0.9206148562433667, + "grad_norm": 0.1334242820739746, + "learning_rate": 0.001, + "loss": 2.1052, + "num_input_tokens_seen": 50585537568, + "step": 96500 + }, + { + "epoch": 0.9206148562433667, + "eval_loss": 2.0225579738616943, + "eval_runtime": 79.9486, + "eval_samples_per_second": 62.54, + "eval_steps_per_second": 15.635, + "num_input_tokens_seen": 50585537568, + "step": 96500 + }, + { + "epoch": 0.9210918587595549, + "grad_norm": 0.1523120254278183, + "learning_rate": 0.001, + "loss": 2.098, + "num_input_tokens_seen": 50611746976, + "step": 96550 + }, + { + "epoch": 0.9215688612757432, + "grad_norm": 0.14341165125370026, + "learning_rate": 0.001, + "loss": 2.1105, + "num_input_tokens_seen": 50637952320, + "step": 96600 + }, + { + "epoch": 0.9220458637919315, + "grad_norm": 0.15297015011310577, + "learning_rate": 0.001, + "loss": 2.1116, + "num_input_tokens_seen": 50664162720, + "step": 96650 + }, + { + "epoch": 0.9225228663081197, + "grad_norm": 0.15151242911815643, + "learning_rate": 0.001, + "loss": 2.1109, + "num_input_tokens_seen": 50690366624, + "step": 96700 + }, + { + "epoch": 0.9229998688243081, + "grad_norm": 0.14462804794311523, + "learning_rate": 0.001, + "loss": 2.117, + "num_input_tokens_seen": 50716579904, + "step": 96750 + }, + { + "epoch": 0.9234768713404963, + "grad_norm": 0.1390417069196701, + "learning_rate": 0.001, + "loss": 2.1016, + "num_input_tokens_seen": 50742794304, + "step": 96800 + }, + { + "epoch": 0.9239538738566846, + "grad_norm": 0.14151330292224884, + "learning_rate": 0.001, + "loss": 2.1032, + "num_input_tokens_seen": 50769000928, + "step": 96850 + }, + { + "epoch": 0.9244308763728729, + "grad_norm": 0.1432236135005951, + "learning_rate": 0.001, + "loss": 2.1133, + "num_input_tokens_seen": 50795209632, + "step": 96900 + }, + { + "epoch": 0.9249078788890611, + "grad_norm": 0.14917080104351044, + "learning_rate": 0.001, + "loss": 2.1092, + "num_input_tokens_seen": 50821418560, + "step": 96950 + }, + { + "epoch": 0.9253848814052494, + "grad_norm": 0.14105528593063354, + "learning_rate": 0.001, + "loss": 2.1024, + "num_input_tokens_seen": 50847631776, + "step": 97000 + }, + { + "epoch": 0.9253848814052494, + "eval_loss": 2.0225133895874023, + "eval_runtime": 79.6596, + "eval_samples_per_second": 62.767, + "eval_steps_per_second": 15.692, + "num_input_tokens_seen": 50847631776, + "step": 97000 + }, + { + "epoch": 0.9258618839214376, + "grad_norm": 0.1577770859003067, + "learning_rate": 0.001, + "loss": 2.1079, + "num_input_tokens_seen": 50873841088, + "step": 97050 + }, + { + "epoch": 0.926338886437626, + "grad_norm": 0.13983358442783356, + "learning_rate": 0.001, + "loss": 2.1037, + "num_input_tokens_seen": 50900050112, + "step": 97100 + }, + { + "epoch": 0.9268158889538143, + "grad_norm": 0.14196738600730896, + "learning_rate": 0.001, + "loss": 2.109, + "num_input_tokens_seen": 50926261280, + "step": 97150 + }, + { + "epoch": 0.9272928914700025, + "grad_norm": 0.1525181531906128, + "learning_rate": 0.001, + "loss": 2.1104, + "num_input_tokens_seen": 50952473344, + "step": 97200 + }, + { + "epoch": 0.9277698939861908, + "grad_norm": 0.14153936505317688, + "learning_rate": 0.001, + "loss": 2.1188, + "num_input_tokens_seen": 50978687744, + "step": 97250 + }, + { + "epoch": 0.928246896502379, + "grad_norm": 0.13389533758163452, + "learning_rate": 0.001, + "loss": 2.1066, + "num_input_tokens_seen": 51004893696, + "step": 97300 + }, + { + "epoch": 0.9287238990185673, + "grad_norm": 0.1618724912405014, + "learning_rate": 0.001, + "loss": 2.109, + "num_input_tokens_seen": 51031102752, + "step": 97350 + }, + { + "epoch": 0.9292009015347555, + "grad_norm": 0.146076038479805, + "learning_rate": 0.001, + "loss": 2.1104, + "num_input_tokens_seen": 51057314816, + "step": 97400 + }, + { + "epoch": 0.9296779040509439, + "grad_norm": 0.14311222732067108, + "learning_rate": 0.001, + "loss": 2.1074, + "num_input_tokens_seen": 51083520832, + "step": 97450 + }, + { + "epoch": 0.9301549065671322, + "grad_norm": 0.1500881314277649, + "learning_rate": 0.001, + "loss": 2.1032, + "num_input_tokens_seen": 51109732512, + "step": 97500 + }, + { + "epoch": 0.9301549065671322, + "eval_loss": 2.022352695465088, + "eval_runtime": 80.43, + "eval_samples_per_second": 62.166, + "eval_steps_per_second": 15.541, + "num_input_tokens_seen": 51109732512, + "step": 97500 + }, + { + "epoch": 0.9306319090833204, + "grad_norm": 0.15114109218120575, + "learning_rate": 0.001, + "loss": 2.109, + "num_input_tokens_seen": 51135938816, + "step": 97550 + }, + { + "epoch": 0.9311089115995087, + "grad_norm": 0.1397593915462494, + "learning_rate": 0.001, + "loss": 2.1089, + "num_input_tokens_seen": 51162148352, + "step": 97600 + }, + { + "epoch": 0.9315859141156969, + "grad_norm": 0.13163454830646515, + "learning_rate": 0.001, + "loss": 2.0974, + "num_input_tokens_seen": 51188357216, + "step": 97650 + }, + { + "epoch": 0.9320629166318852, + "grad_norm": 0.1726287603378296, + "learning_rate": 0.001, + "loss": 2.1258, + "num_input_tokens_seen": 51214571616, + "step": 97700 + }, + { + "epoch": 0.9325399191480735, + "grad_norm": 0.14246715605258942, + "learning_rate": 0.001, + "loss": 2.0959, + "num_input_tokens_seen": 51240780352, + "step": 97750 + }, + { + "epoch": 0.9330169216642618, + "grad_norm": 0.13136450946331024, + "learning_rate": 0.001, + "loss": 2.1083, + "num_input_tokens_seen": 51266988320, + "step": 97800 + }, + { + "epoch": 0.9334939241804501, + "grad_norm": 0.15461480617523193, + "learning_rate": 0.001, + "loss": 2.106, + "num_input_tokens_seen": 51293202720, + "step": 97850 + }, + { + "epoch": 0.9339709266966383, + "grad_norm": 0.14553368091583252, + "learning_rate": 0.001, + "loss": 2.1029, + "num_input_tokens_seen": 51319415616, + "step": 97900 + }, + { + "epoch": 0.9344479292128266, + "grad_norm": 0.14998067915439606, + "learning_rate": 0.001, + "loss": 2.1174, + "num_input_tokens_seen": 51345619808, + "step": 97950 + }, + { + "epoch": 0.9349249317290149, + "grad_norm": 0.14988018572330475, + "learning_rate": 0.001, + "loss": 2.0984, + "num_input_tokens_seen": 51371830432, + "step": 98000 + }, + { + "epoch": 0.9349249317290149, + "eval_loss": 2.02105712890625, + "eval_runtime": 80.5093, + "eval_samples_per_second": 62.105, + "eval_steps_per_second": 15.526, + "num_input_tokens_seen": 51371830432, + "step": 98000 + }, + { + "epoch": 0.9354019342452031, + "grad_norm": 0.1521526575088501, + "learning_rate": 0.001, + "loss": 2.0988, + "num_input_tokens_seen": 51398036448, + "step": 98050 + }, + { + "epoch": 0.9358789367613914, + "grad_norm": 0.13992975652217865, + "learning_rate": 0.001, + "loss": 2.1074, + "num_input_tokens_seen": 51424242880, + "step": 98100 + }, + { + "epoch": 0.9363559392775797, + "grad_norm": 0.1415923684835434, + "learning_rate": 0.001, + "loss": 2.0952, + "num_input_tokens_seen": 51450457280, + "step": 98150 + }, + { + "epoch": 0.936832941793768, + "grad_norm": 0.15722358226776123, + "learning_rate": 0.001, + "loss": 2.1148, + "num_input_tokens_seen": 51476663840, + "step": 98200 + }, + { + "epoch": 0.9373099443099563, + "grad_norm": 0.14942607283592224, + "learning_rate": 0.001, + "loss": 2.1009, + "num_input_tokens_seen": 51502876256, + "step": 98250 + }, + { + "epoch": 0.9377869468261445, + "grad_norm": 0.1397363543510437, + "learning_rate": 0.001, + "loss": 2.1083, + "num_input_tokens_seen": 51529086944, + "step": 98300 + }, + { + "epoch": 0.9382639493423328, + "grad_norm": 0.14004074037075043, + "learning_rate": 0.001, + "loss": 2.1009, + "num_input_tokens_seen": 51555301344, + "step": 98350 + }, + { + "epoch": 0.938740951858521, + "grad_norm": 0.15313847362995148, + "learning_rate": 0.001, + "loss": 2.1016, + "num_input_tokens_seen": 51581506560, + "step": 98400 + }, + { + "epoch": 0.9392179543747093, + "grad_norm": 0.1391165405511856, + "learning_rate": 0.001, + "loss": 2.1054, + "num_input_tokens_seen": 51607720960, + "step": 98450 + }, + { + "epoch": 0.9396949568908975, + "grad_norm": 0.15387007594108582, + "learning_rate": 0.001, + "loss": 2.1133, + "num_input_tokens_seen": 51633931744, + "step": 98500 + }, + { + "epoch": 0.9396949568908975, + "eval_loss": 2.0206305980682373, + "eval_runtime": 79.5875, + "eval_samples_per_second": 62.824, + "eval_steps_per_second": 15.706, + "num_input_tokens_seen": 51633931744, + "step": 98500 + }, + { + "epoch": 0.9401719594070859, + "grad_norm": 0.15347087383270264, + "learning_rate": 0.001, + "loss": 2.1066, + "num_input_tokens_seen": 51660140800, + "step": 98550 + }, + { + "epoch": 0.9406489619232742, + "grad_norm": 0.14395572245121002, + "learning_rate": 0.001, + "loss": 2.104, + "num_input_tokens_seen": 51686349760, + "step": 98600 + }, + { + "epoch": 0.9411259644394624, + "grad_norm": 0.1397567093372345, + "learning_rate": 0.001, + "loss": 2.1054, + "num_input_tokens_seen": 51712553888, + "step": 98650 + }, + { + "epoch": 0.9416029669556507, + "grad_norm": 0.14795741438865662, + "learning_rate": 0.001, + "loss": 2.0911, + "num_input_tokens_seen": 51738759968, + "step": 98700 + }, + { + "epoch": 0.9420799694718389, + "grad_norm": 0.15419213473796844, + "learning_rate": 0.001, + "loss": 2.1068, + "num_input_tokens_seen": 51764972992, + "step": 98750 + }, + { + "epoch": 0.9425569719880272, + "grad_norm": 0.14047859609127045, + "learning_rate": 0.001, + "loss": 2.1007, + "num_input_tokens_seen": 51791183968, + "step": 98800 + }, + { + "epoch": 0.9430339745042156, + "grad_norm": 0.15431874990463257, + "learning_rate": 0.001, + "loss": 2.1006, + "num_input_tokens_seen": 51817396704, + "step": 98850 + }, + { + "epoch": 0.9435109770204038, + "grad_norm": 0.14634360373020172, + "learning_rate": 0.001, + "loss": 2.113, + "num_input_tokens_seen": 51843609440, + "step": 98900 + }, + { + "epoch": 0.9439879795365921, + "grad_norm": 0.13855423033237457, + "learning_rate": 0.001, + "loss": 2.0972, + "num_input_tokens_seen": 51869820832, + "step": 98950 + }, + { + "epoch": 0.9444649820527803, + "grad_norm": 0.14774784445762634, + "learning_rate": 0.001, + "loss": 2.1038, + "num_input_tokens_seen": 51896030848, + "step": 99000 + }, + { + "epoch": 0.9444649820527803, + "eval_loss": 2.020467519760132, + "eval_runtime": 80.0716, + "eval_samples_per_second": 62.444, + "eval_steps_per_second": 15.611, + "num_input_tokens_seen": 51896030848, + "step": 99000 + }, + { + "epoch": 0.9449419845689686, + "grad_norm": 0.12909556925296783, + "learning_rate": 0.001, + "loss": 2.1117, + "num_input_tokens_seen": 51922241632, + "step": 99050 + }, + { + "epoch": 0.9454189870851569, + "grad_norm": 0.14898623526096344, + "learning_rate": 0.001, + "loss": 2.098, + "num_input_tokens_seen": 51948454816, + "step": 99100 + }, + { + "epoch": 0.9458959896013451, + "grad_norm": 0.1472884863615036, + "learning_rate": 0.001, + "loss": 2.1143, + "num_input_tokens_seen": 51974667424, + "step": 99150 + }, + { + "epoch": 0.9463729921175335, + "grad_norm": 0.15299226343631744, + "learning_rate": 0.001, + "loss": 2.1064, + "num_input_tokens_seen": 52000872800, + "step": 99200 + }, + { + "epoch": 0.9468499946337217, + "grad_norm": 0.15084148943424225, + "learning_rate": 0.001, + "loss": 2.108, + "num_input_tokens_seen": 52027087200, + "step": 99250 + }, + { + "epoch": 0.94732699714991, + "grad_norm": 0.14663194119930267, + "learning_rate": 0.001, + "loss": 2.1039, + "num_input_tokens_seen": 52053301600, + "step": 99300 + }, + { + "epoch": 0.9478039996660982, + "grad_norm": 0.1480923891067505, + "learning_rate": 0.001, + "loss": 2.103, + "num_input_tokens_seen": 52079512832, + "step": 99350 + }, + { + "epoch": 0.9482810021822865, + "grad_norm": 0.14708015322685242, + "learning_rate": 0.001, + "loss": 2.1194, + "num_input_tokens_seen": 52105726016, + "step": 99400 + }, + { + "epoch": 0.9487580046984748, + "grad_norm": 0.1524500995874405, + "learning_rate": 0.001, + "loss": 2.1092, + "num_input_tokens_seen": 52131935744, + "step": 99450 + }, + { + "epoch": 0.949235007214663, + "grad_norm": 0.15018624067306519, + "learning_rate": 0.001, + "loss": 2.1021, + "num_input_tokens_seen": 52158147168, + "step": 99500 + }, + { + "epoch": 0.949235007214663, + "eval_loss": 2.022200345993042, + "eval_runtime": 79.553, + "eval_samples_per_second": 62.851, + "eval_steps_per_second": 15.713, + "num_input_tokens_seen": 52158147168, + "step": 99500 + }, + { + "epoch": 0.9497120097308513, + "grad_norm": 0.13732361793518066, + "learning_rate": 0.001, + "loss": 2.1062, + "num_input_tokens_seen": 52184356352, + "step": 99550 + }, + { + "epoch": 0.9501890122470396, + "grad_norm": 0.14235976338386536, + "learning_rate": 0.001, + "loss": 2.1168, + "num_input_tokens_seen": 52210570752, + "step": 99600 + }, + { + "epoch": 0.9506660147632279, + "grad_norm": 0.1550482213497162, + "learning_rate": 0.001, + "loss": 2.1053, + "num_input_tokens_seen": 52236785152, + "step": 99650 + }, + { + "epoch": 0.9511430172794162, + "grad_norm": 0.1557578146457672, + "learning_rate": 0.001, + "loss": 2.1026, + "num_input_tokens_seen": 52262997760, + "step": 99700 + }, + { + "epoch": 0.9516200197956044, + "grad_norm": 0.1451166272163391, + "learning_rate": 0.001, + "loss": 2.0997, + "num_input_tokens_seen": 52289205056, + "step": 99750 + }, + { + "epoch": 0.9520970223117927, + "grad_norm": 0.15717899799346924, + "learning_rate": 0.001, + "loss": 2.1088, + "num_input_tokens_seen": 52315417376, + "step": 99800 + }, + { + "epoch": 0.9525740248279809, + "grad_norm": 0.14595787227153778, + "learning_rate": 0.001, + "loss": 2.1015, + "num_input_tokens_seen": 52341619488, + "step": 99850 + }, + { + "epoch": 0.9530510273441692, + "grad_norm": 0.1477060317993164, + "learning_rate": 0.001, + "loss": 2.0991, + "num_input_tokens_seen": 52367830048, + "step": 99900 + }, + { + "epoch": 0.9535280298603576, + "grad_norm": 0.15001972019672394, + "learning_rate": 0.001, + "loss": 2.1039, + "num_input_tokens_seen": 52394036064, + "step": 99950 + }, + { + "epoch": 0.9540050323765458, + "grad_norm": 0.13796518743038177, + "learning_rate": 0.001, + "loss": 2.1116, + "num_input_tokens_seen": 52420247968, + "step": 100000 + }, + { + "epoch": 0.9540050323765458, + "eval_loss": 2.0211336612701416, + "eval_runtime": 80.1109, + "eval_samples_per_second": 62.413, + "eval_steps_per_second": 15.603, + "num_input_tokens_seen": 52420247968, + "step": 100000 + }, + { + "epoch": 0.9544820348927341, + "grad_norm": 0.13650113344192505, + "learning_rate": 0.001, + "loss": 2.1018, + "num_input_tokens_seen": 52446460160, + "step": 100050 + }, + { + "epoch": 0.9549590374089223, + "grad_norm": 0.14391744136810303, + "learning_rate": 0.001, + "loss": 2.1004, + "num_input_tokens_seen": 52472671616, + "step": 100100 + }, + { + "epoch": 0.9554360399251106, + "grad_norm": 0.14373421669006348, + "learning_rate": 0.001, + "loss": 2.1157, + "num_input_tokens_seen": 52498882464, + "step": 100150 + }, + { + "epoch": 0.9559130424412989, + "grad_norm": 0.14512939751148224, + "learning_rate": 0.001, + "loss": 2.0999, + "num_input_tokens_seen": 52525095328, + "step": 100200 + }, + { + "epoch": 0.9563900449574871, + "grad_norm": 0.14848357439041138, + "learning_rate": 0.001, + "loss": 2.1042, + "num_input_tokens_seen": 52551306848, + "step": 100250 + }, + { + "epoch": 0.9568670474736755, + "grad_norm": 0.14673751592636108, + "learning_rate": 0.001, + "loss": 2.0989, + "num_input_tokens_seen": 52577512416, + "step": 100300 + }, + { + "epoch": 0.9573440499898637, + "grad_norm": 0.13780055940151215, + "learning_rate": 0.001, + "loss": 2.1075, + "num_input_tokens_seen": 52603721312, + "step": 100350 + }, + { + "epoch": 0.957821052506052, + "grad_norm": 0.33724644780158997, + "learning_rate": 0.001, + "loss": 2.0947, + "num_input_tokens_seen": 52629927296, + "step": 100400 + }, + { + "epoch": 0.9582980550222402, + "grad_norm": 0.14704908430576324, + "learning_rate": 0.001, + "loss": 2.102, + "num_input_tokens_seen": 52656138304, + "step": 100450 + }, + { + "epoch": 0.9587750575384285, + "grad_norm": 0.14224405586719513, + "learning_rate": 0.001, + "loss": 2.1026, + "num_input_tokens_seen": 52682343360, + "step": 100500 + }, + { + "epoch": 0.9587750575384285, + "eval_loss": 2.0196518898010254, + "eval_runtime": 79.8344, + "eval_samples_per_second": 62.63, + "eval_steps_per_second": 15.657, + "num_input_tokens_seen": 52682343360, + "step": 100500 + }, + { + "epoch": 0.9592520600546168, + "grad_norm": 0.14282722771167755, + "learning_rate": 0.001, + "loss": 2.0973, + "num_input_tokens_seen": 52708556800, + "step": 100550 + }, + { + "epoch": 0.959729062570805, + "grad_norm": 0.1467045545578003, + "learning_rate": 0.001, + "loss": 2.1036, + "num_input_tokens_seen": 52734764768, + "step": 100600 + }, + { + "epoch": 0.9602060650869934, + "grad_norm": 0.16190816462039948, + "learning_rate": 0.001, + "loss": 2.1033, + "num_input_tokens_seen": 52760968160, + "step": 100650 + }, + { + "epoch": 0.9606830676031816, + "grad_norm": 0.1406693458557129, + "learning_rate": 0.001, + "loss": 2.105, + "num_input_tokens_seen": 52787178912, + "step": 100700 + }, + { + "epoch": 0.9611600701193699, + "grad_norm": 0.15562649071216583, + "learning_rate": 0.001, + "loss": 2.1021, + "num_input_tokens_seen": 52813388832, + "step": 100750 + }, + { + "epoch": 0.9616370726355582, + "grad_norm": 0.15426361560821533, + "learning_rate": 0.001, + "loss": 2.0955, + "num_input_tokens_seen": 52839602848, + "step": 100800 + }, + { + "epoch": 0.9621140751517464, + "grad_norm": 0.14050264656543732, + "learning_rate": 0.001, + "loss": 2.1041, + "num_input_tokens_seen": 52865814016, + "step": 100850 + }, + { + "epoch": 0.9625910776679347, + "grad_norm": 0.1460646390914917, + "learning_rate": 0.001, + "loss": 2.1052, + "num_input_tokens_seen": 52892026784, + "step": 100900 + }, + { + "epoch": 0.9630680801841229, + "grad_norm": 0.14038728177547455, + "learning_rate": 0.001, + "loss": 2.1044, + "num_input_tokens_seen": 52918238624, + "step": 100950 + }, + { + "epoch": 0.9635450827003113, + "grad_norm": 0.15031979978084564, + "learning_rate": 0.001, + "loss": 2.1031, + "num_input_tokens_seen": 52944447232, + "step": 101000 + }, + { + "epoch": 0.9635450827003113, + "eval_loss": 2.020580530166626, + "eval_runtime": 79.7901, + "eval_samples_per_second": 62.664, + "eval_steps_per_second": 15.666, + "num_input_tokens_seen": 52944447232, + "step": 101000 + }, + { + "epoch": 0.9640220852164996, + "grad_norm": 0.1539318561553955, + "learning_rate": 0.001, + "loss": 2.0988, + "num_input_tokens_seen": 52970660704, + "step": 101050 + }, + { + "epoch": 0.9644990877326878, + "grad_norm": 0.16137336194515228, + "learning_rate": 0.001, + "loss": 2.1055, + "num_input_tokens_seen": 52996875104, + "step": 101100 + }, + { + "epoch": 0.9649760902488761, + "grad_norm": 0.14159482717514038, + "learning_rate": 0.001, + "loss": 2.1, + "num_input_tokens_seen": 53023078848, + "step": 101150 + }, + { + "epoch": 0.9654530927650643, + "grad_norm": 0.1358761340379715, + "learning_rate": 0.001, + "loss": 2.0979, + "num_input_tokens_seen": 53049291552, + "step": 101200 + }, + { + "epoch": 0.9659300952812526, + "grad_norm": 0.14332178235054016, + "learning_rate": 0.001, + "loss": 2.1029, + "num_input_tokens_seen": 53075499328, + "step": 101250 + }, + { + "epoch": 0.9664070977974408, + "grad_norm": 0.14132554829120636, + "learning_rate": 0.001, + "loss": 2.1036, + "num_input_tokens_seen": 53101712448, + "step": 101300 + }, + { + "epoch": 0.9668841003136291, + "grad_norm": 0.15662212669849396, + "learning_rate": 0.001, + "loss": 2.1036, + "num_input_tokens_seen": 53127918176, + "step": 101350 + }, + { + "epoch": 0.9673611028298175, + "grad_norm": 0.14150604605674744, + "learning_rate": 0.001, + "loss": 2.1108, + "num_input_tokens_seen": 53154132192, + "step": 101400 + }, + { + "epoch": 0.9678381053460057, + "grad_norm": 0.14353616535663605, + "learning_rate": 0.001, + "loss": 2.1046, + "num_input_tokens_seen": 53180344448, + "step": 101450 + }, + { + "epoch": 0.968315107862194, + "grad_norm": 0.13792720437049866, + "learning_rate": 0.001, + "loss": 2.1127, + "num_input_tokens_seen": 53206553568, + "step": 101500 + }, + { + "epoch": 0.968315107862194, + "eval_loss": 2.0184688568115234, + "eval_runtime": 79.7642, + "eval_samples_per_second": 62.685, + "eval_steps_per_second": 15.671, + "num_input_tokens_seen": 53206553568, + "step": 101500 + }, + { + "epoch": 0.9687921103783822, + "grad_norm": 0.13691678643226624, + "learning_rate": 0.001, + "loss": 2.1006, + "num_input_tokens_seen": 53232767200, + "step": 101550 + }, + { + "epoch": 0.9692691128945705, + "grad_norm": 0.1575424075126648, + "learning_rate": 0.001, + "loss": 2.0954, + "num_input_tokens_seen": 53258974944, + "step": 101600 + }, + { + "epoch": 0.9697461154107588, + "grad_norm": 0.15218724310398102, + "learning_rate": 0.001, + "loss": 2.0979, + "num_input_tokens_seen": 53285187296, + "step": 101650 + }, + { + "epoch": 0.970223117926947, + "grad_norm": 0.1503322720527649, + "learning_rate": 0.001, + "loss": 2.1027, + "num_input_tokens_seen": 53311394144, + "step": 101700 + }, + { + "epoch": 0.9707001204431354, + "grad_norm": 0.16736505925655365, + "learning_rate": 0.001, + "loss": 2.1031, + "num_input_tokens_seen": 53337604896, + "step": 101750 + }, + { + "epoch": 0.9711771229593236, + "grad_norm": 0.14036568999290466, + "learning_rate": 0.001, + "loss": 2.1171, + "num_input_tokens_seen": 53363816960, + "step": 101800 + }, + { + "epoch": 0.9716541254755119, + "grad_norm": 0.1408475935459137, + "learning_rate": 0.001, + "loss": 2.0966, + "num_input_tokens_seen": 53390028096, + "step": 101850 + }, + { + "epoch": 0.9721311279917002, + "grad_norm": 0.146541029214859, + "learning_rate": 0.001, + "loss": 2.1082, + "num_input_tokens_seen": 53416241344, + "step": 101900 + }, + { + "epoch": 0.9726081305078884, + "grad_norm": 0.1453496813774109, + "learning_rate": 0.001, + "loss": 2.0955, + "num_input_tokens_seen": 53442449632, + "step": 101950 + }, + { + "epoch": 0.9730851330240767, + "grad_norm": 0.14003250002861023, + "learning_rate": 0.001, + "loss": 2.0992, + "num_input_tokens_seen": 53468658336, + "step": 102000 + }, + { + "epoch": 0.9730851330240767, + "eval_loss": 2.0179874897003174, + "eval_runtime": 80.0076, + "eval_samples_per_second": 62.494, + "eval_steps_per_second": 15.624, + "num_input_tokens_seen": 53468658336, + "step": 102000 + }, + { + "epoch": 0.9735621355402649, + "grad_norm": 0.14326773583889008, + "learning_rate": 0.001, + "loss": 2.0932, + "num_input_tokens_seen": 53494871072, + "step": 102050 + }, + { + "epoch": 0.9740391380564533, + "grad_norm": 0.13583138585090637, + "learning_rate": 0.001, + "loss": 2.0911, + "num_input_tokens_seen": 53521073504, + "step": 102100 + }, + { + "epoch": 0.9745161405726416, + "grad_norm": 0.13492164015769958, + "learning_rate": 0.001, + "loss": 2.1079, + "num_input_tokens_seen": 53547285632, + "step": 102150 + }, + { + "epoch": 0.9749931430888298, + "grad_norm": 0.1361209750175476, + "learning_rate": 0.001, + "loss": 2.1075, + "num_input_tokens_seen": 53573498688, + "step": 102200 + }, + { + "epoch": 0.9754701456050181, + "grad_norm": 0.15009672939777374, + "learning_rate": 0.001, + "loss": 2.1096, + "num_input_tokens_seen": 53599709344, + "step": 102250 + }, + { + "epoch": 0.9759471481212063, + "grad_norm": 0.14739353954792023, + "learning_rate": 0.001, + "loss": 2.1113, + "num_input_tokens_seen": 53625921312, + "step": 102300 + }, + { + "epoch": 0.9764241506373946, + "grad_norm": 0.13327108323574066, + "learning_rate": 0.001, + "loss": 2.0993, + "num_input_tokens_seen": 53652133472, + "step": 102350 + }, + { + "epoch": 0.9769011531535828, + "grad_norm": 0.15938664972782135, + "learning_rate": 0.001, + "loss": 2.1044, + "num_input_tokens_seen": 53678346208, + "step": 102400 + }, + { + "epoch": 0.9773781556697712, + "grad_norm": 0.14256815612316132, + "learning_rate": 0.001, + "loss": 2.0995, + "num_input_tokens_seen": 53704558400, + "step": 102450 + }, + { + "epoch": 0.9778551581859595, + "grad_norm": 3.5962650775909424, + "learning_rate": 0.001, + "loss": 2.0968, + "num_input_tokens_seen": 53730772800, + "step": 102500 + }, + { + "epoch": 0.9778551581859595, + "eval_loss": 2.0211129188537598, + "eval_runtime": 80.3018, + "eval_samples_per_second": 62.265, + "eval_steps_per_second": 15.566, + "num_input_tokens_seen": 53730772800, + "step": 102500 + }, + { + "epoch": 0.9783321607021477, + "grad_norm": 0.14861001074314117, + "learning_rate": 0.001, + "loss": 2.1211, + "num_input_tokens_seen": 53756986432, + "step": 102550 + }, + { + "epoch": 0.978809163218336, + "grad_norm": 0.13950598239898682, + "learning_rate": 0.001, + "loss": 2.1026, + "num_input_tokens_seen": 53783199840, + "step": 102600 + }, + { + "epoch": 0.9792861657345242, + "grad_norm": 0.14435075223445892, + "learning_rate": 0.001, + "loss": 2.1062, + "num_input_tokens_seen": 53809407904, + "step": 102650 + }, + { + "epoch": 0.9797631682507125, + "grad_norm": 0.1405237913131714, + "learning_rate": 0.001, + "loss": 2.1138, + "num_input_tokens_seen": 53835613280, + "step": 102700 + }, + { + "epoch": 0.9802401707669008, + "grad_norm": 0.1563555896282196, + "learning_rate": 0.001, + "loss": 2.104, + "num_input_tokens_seen": 53861818848, + "step": 102750 + }, + { + "epoch": 0.980717173283089, + "grad_norm": 0.15651467442512512, + "learning_rate": 0.001, + "loss": 2.1134, + "num_input_tokens_seen": 53888025952, + "step": 102800 + }, + { + "epoch": 0.9811941757992774, + "grad_norm": 0.1491318941116333, + "learning_rate": 0.001, + "loss": 2.1179, + "num_input_tokens_seen": 53914237056, + "step": 102850 + }, + { + "epoch": 0.9816711783154656, + "grad_norm": 0.15486833453178406, + "learning_rate": 0.001, + "loss": 2.1086, + "num_input_tokens_seen": 53940442144, + "step": 102900 + }, + { + "epoch": 0.9821481808316539, + "grad_norm": 0.14997461438179016, + "learning_rate": 0.001, + "loss": 2.1029, + "num_input_tokens_seen": 53966652960, + "step": 102950 + }, + { + "epoch": 0.9826251833478422, + "grad_norm": 0.16969485580921173, + "learning_rate": 0.001, + "loss": 2.092, + "num_input_tokens_seen": 53992863264, + "step": 103000 + }, + { + "epoch": 0.9826251833478422, + "eval_loss": 2.017953634262085, + "eval_runtime": 80.0806, + "eval_samples_per_second": 62.437, + "eval_steps_per_second": 15.609, + "num_input_tokens_seen": 53992863264, + "step": 103000 + }, + { + "epoch": 0.9831021858640304, + "grad_norm": 0.1433195322751999, + "learning_rate": 0.001, + "loss": 2.1045, + "num_input_tokens_seen": 54019076320, + "step": 103050 + }, + { + "epoch": 0.9835791883802187, + "grad_norm": 0.13524982333183289, + "learning_rate": 0.001, + "loss": 2.1099, + "num_input_tokens_seen": 54045286592, + "step": 103100 + }, + { + "epoch": 0.984056190896407, + "grad_norm": 0.13636088371276855, + "learning_rate": 0.001, + "loss": 2.1139, + "num_input_tokens_seen": 54071491616, + "step": 103150 + }, + { + "epoch": 0.9845331934125953, + "grad_norm": 0.14049679040908813, + "learning_rate": 0.001, + "loss": 2.1031, + "num_input_tokens_seen": 54097697184, + "step": 103200 + }, + { + "epoch": 0.9850101959287835, + "grad_norm": 0.13444900512695312, + "learning_rate": 0.001, + "loss": 2.0941, + "num_input_tokens_seen": 54123909408, + "step": 103250 + }, + { + "epoch": 0.9854871984449718, + "grad_norm": 0.136467844247818, + "learning_rate": 0.001, + "loss": 2.1055, + "num_input_tokens_seen": 54150121152, + "step": 103300 + }, + { + "epoch": 0.9859642009611601, + "grad_norm": 0.14821065962314606, + "learning_rate": 0.001, + "loss": 2.1062, + "num_input_tokens_seen": 54176333824, + "step": 103350 + }, + { + "epoch": 0.9864412034773483, + "grad_norm": 0.15114334225654602, + "learning_rate": 0.001, + "loss": 2.0973, + "num_input_tokens_seen": 54202545376, + "step": 103400 + }, + { + "epoch": 0.9869182059935366, + "grad_norm": 0.13535051047801971, + "learning_rate": 0.001, + "loss": 2.1059, + "num_input_tokens_seen": 54228754304, + "step": 103450 + }, + { + "epoch": 0.9873952085097248, + "grad_norm": 0.13229253888130188, + "learning_rate": 0.001, + "loss": 2.1016, + "num_input_tokens_seen": 54254966464, + "step": 103500 + }, + { + "epoch": 0.9873952085097248, + "eval_loss": 2.0163238048553467, + "eval_runtime": 80.7299, + "eval_samples_per_second": 61.935, + "eval_steps_per_second": 15.484, + "num_input_tokens_seen": 54254966464, + "step": 103500 + }, + { + "epoch": 0.9878722110259132, + "grad_norm": 0.13689298927783966, + "learning_rate": 0.001, + "loss": 2.0935, + "num_input_tokens_seen": 54281180864, + "step": 103550 + }, + { + "epoch": 0.9883492135421015, + "grad_norm": 0.13803903758525848, + "learning_rate": 0.001, + "loss": 2.1036, + "num_input_tokens_seen": 54307394080, + "step": 103600 + }, + { + "epoch": 0.9888262160582897, + "grad_norm": 0.1357845515012741, + "learning_rate": 0.001, + "loss": 2.0962, + "num_input_tokens_seen": 54333608480, + "step": 103650 + }, + { + "epoch": 0.989303218574478, + "grad_norm": 0.14026238024234772, + "learning_rate": 0.001, + "loss": 2.1001, + "num_input_tokens_seen": 54359813856, + "step": 103700 + }, + { + "epoch": 0.9897802210906662, + "grad_norm": 0.14243866503238678, + "learning_rate": 0.001, + "loss": 2.1065, + "num_input_tokens_seen": 54386025280, + "step": 103750 + }, + { + "epoch": 0.9902572236068545, + "grad_norm": 0.14214347302913666, + "learning_rate": 0.001, + "loss": 2.1031, + "num_input_tokens_seen": 54412235712, + "step": 103800 + }, + { + "epoch": 0.9907342261230428, + "grad_norm": 0.13983015716075897, + "learning_rate": 0.001, + "loss": 2.0936, + "num_input_tokens_seen": 54438450112, + "step": 103850 + }, + { + "epoch": 0.9912112286392311, + "grad_norm": 0.15471301972866058, + "learning_rate": 0.001, + "loss": 2.0967, + "num_input_tokens_seen": 54464659648, + "step": 103900 + }, + { + "epoch": 0.9916882311554194, + "grad_norm": 0.14359566569328308, + "learning_rate": 0.001, + "loss": 2.0986, + "num_input_tokens_seen": 54490872800, + "step": 103950 + }, + { + "epoch": 0.9921652336716076, + "grad_norm": 0.149493008852005, + "learning_rate": 0.001, + "loss": 2.1098, + "num_input_tokens_seen": 54517083360, + "step": 104000 + }, + { + "epoch": 0.9921652336716076, + "eval_loss": 2.0173633098602295, + "eval_runtime": 79.9475, + "eval_samples_per_second": 62.541, + "eval_steps_per_second": 15.635, + "num_input_tokens_seen": 54517083360, + "step": 104000 + }, + { + "epoch": 0.9926422361877959, + "grad_norm": 0.1350771188735962, + "learning_rate": 0.001, + "loss": 2.0914, + "num_input_tokens_seen": 54543296096, + "step": 104050 + }, + { + "epoch": 0.9931192387039841, + "grad_norm": 0.14034296572208405, + "learning_rate": 0.001, + "loss": 2.1122, + "num_input_tokens_seen": 54569506560, + "step": 104100 + }, + { + "epoch": 0.9935962412201724, + "grad_norm": 0.14050833880901337, + "learning_rate": 0.001, + "loss": 2.1046, + "num_input_tokens_seen": 54595720960, + "step": 104150 + }, + { + "epoch": 0.9940732437363607, + "grad_norm": 0.1437423974275589, + "learning_rate": 0.001, + "loss": 2.0914, + "num_input_tokens_seen": 54621935360, + "step": 104200 + }, + { + "epoch": 0.994550246252549, + "grad_norm": 0.1375901997089386, + "learning_rate": 0.001, + "loss": 2.1054, + "num_input_tokens_seen": 54648141216, + "step": 104250 + }, + { + "epoch": 0.9950272487687373, + "grad_norm": 0.14355972409248352, + "learning_rate": 0.001, + "loss": 2.1057, + "num_input_tokens_seen": 54674354528, + "step": 104300 + }, + { + "epoch": 0.9955042512849255, + "grad_norm": 0.1537715196609497, + "learning_rate": 0.001, + "loss": 2.102, + "num_input_tokens_seen": 54700562880, + "step": 104350 + }, + { + "epoch": 0.9959812538011138, + "grad_norm": 0.14420664310455322, + "learning_rate": 0.001, + "loss": 2.1207, + "num_input_tokens_seen": 54726772416, + "step": 104400 + }, + { + "epoch": 0.9964582563173021, + "grad_norm": 0.14655081927776337, + "learning_rate": 0.001, + "loss": 2.0958, + "num_input_tokens_seen": 54752977152, + "step": 104450 + }, + { + "epoch": 0.9969352588334903, + "grad_norm": 0.15202060341835022, + "learning_rate": 0.001, + "loss": 2.1103, + "num_input_tokens_seen": 54779191328, + "step": 104500 + }, + { + "epoch": 0.9969352588334903, + "eval_loss": 2.0176143646240234, + "eval_runtime": 79.7652, + "eval_samples_per_second": 62.684, + "eval_steps_per_second": 15.671, + "num_input_tokens_seen": 54779191328, + "step": 104500 + }, + { + "epoch": 0.9974122613496786, + "grad_norm": 0.15168489515781403, + "learning_rate": 0.001, + "loss": 2.0964, + "num_input_tokens_seen": 54805401728, + "step": 104550 + }, + { + "epoch": 0.9978892638658668, + "grad_norm": 0.14086578786373138, + "learning_rate": 0.001, + "loss": 2.1098, + "num_input_tokens_seen": 54831615168, + "step": 104600 + }, + { + "epoch": 0.9983662663820552, + "grad_norm": 0.14561446011066437, + "learning_rate": 0.001, + "loss": 2.0925, + "num_input_tokens_seen": 54857821280, + "step": 104650 + }, + { + "epoch": 0.9988432688982435, + "grad_norm": 0.16160067915916443, + "learning_rate": 0.001, + "loss": 2.1016, + "num_input_tokens_seen": 54884026784, + "step": 104700 + }, + { + "epoch": 0.9993202714144317, + "grad_norm": 0.14295999705791473, + "learning_rate": 0.001, + "loss": 2.0871, + "num_input_tokens_seen": 54910241184, + "step": 104750 + }, + { + "epoch": 0.99979727393062, + "grad_norm": 0.163029745221138, + "learning_rate": 0.001, + "loss": 2.1087, + "num_input_tokens_seen": 54936452640, + "step": 104800 + }, + { + "epoch": 1.0002766614593892, + "grad_norm": 0.14676038920879364, + "learning_rate": 0.001, + "loss": 2.1678, + "num_input_tokens_seen": 54962782880, + "step": 104850 + }, + { + "epoch": 1.0007536639755774, + "grad_norm": 0.14179593324661255, + "learning_rate": 0.001, + "loss": 2.0905, + "num_input_tokens_seen": 54988994528, + "step": 104900 + }, + { + "epoch": 1.0012306664917658, + "grad_norm": 0.1496460884809494, + "learning_rate": 0.001, + "loss": 2.0938, + "num_input_tokens_seen": 55015208928, + "step": 104950 + }, + { + "epoch": 1.001707669007954, + "grad_norm": 0.15026605129241943, + "learning_rate": 0.001, + "loss": 2.0879, + "num_input_tokens_seen": 55041423104, + "step": 105000 + }, + { + "epoch": 1.001707669007954, + "eval_loss": 2.0179128646850586, + "eval_runtime": 80.8772, + "eval_samples_per_second": 61.822, + "eval_steps_per_second": 15.456, + "num_input_tokens_seen": 55041423104, + "step": 105000 + }, + { + "epoch": 1.0021846715241423, + "grad_norm": 0.13333049416542053, + "learning_rate": 0.001, + "loss": 2.0942, + "num_input_tokens_seen": 55067629792, + "step": 105050 + }, + { + "epoch": 1.0026616740403305, + "grad_norm": 0.15017394721508026, + "learning_rate": 0.001, + "loss": 2.0944, + "num_input_tokens_seen": 55093844192, + "step": 105100 + }, + { + "epoch": 1.003138676556519, + "grad_norm": 0.14982599020004272, + "learning_rate": 0.001, + "loss": 2.0949, + "num_input_tokens_seen": 55120057696, + "step": 105150 + }, + { + "epoch": 1.003615679072707, + "grad_norm": 0.13318419456481934, + "learning_rate": 0.001, + "loss": 2.0903, + "num_input_tokens_seen": 55146267488, + "step": 105200 + }, + { + "epoch": 1.0040926815888953, + "grad_norm": 0.13913436233997345, + "learning_rate": 0.001, + "loss": 2.0905, + "num_input_tokens_seen": 55172480896, + "step": 105250 + }, + { + "epoch": 1.0045696841050837, + "grad_norm": 0.14818261563777924, + "learning_rate": 0.001, + "loss": 2.096, + "num_input_tokens_seen": 55198691328, + "step": 105300 + }, + { + "epoch": 1.005046686621272, + "grad_norm": 0.15057435631752014, + "learning_rate": 0.001, + "loss": 2.0929, + "num_input_tokens_seen": 55224898976, + "step": 105350 + }, + { + "epoch": 1.0055236891374602, + "grad_norm": 0.15632683038711548, + "learning_rate": 0.001, + "loss": 2.0921, + "num_input_tokens_seen": 55251107776, + "step": 105400 + }, + { + "epoch": 1.0060006916536486, + "grad_norm": 0.14498716592788696, + "learning_rate": 0.001, + "loss": 2.0977, + "num_input_tokens_seen": 55277321088, + "step": 105450 + }, + { + "epoch": 1.0064776941698368, + "grad_norm": 0.14519184827804565, + "learning_rate": 0.001, + "loss": 2.0964, + "num_input_tokens_seen": 55303527552, + "step": 105500 + }, + { + "epoch": 1.0064776941698368, + "eval_loss": 2.0157995223999023, + "eval_runtime": 80.5085, + "eval_samples_per_second": 62.105, + "eval_steps_per_second": 15.526, + "num_input_tokens_seen": 55303527552, + "step": 105500 + }, + { + "epoch": 1.006954696686025, + "grad_norm": 0.15698741376399994, + "learning_rate": 0.001, + "loss": 2.0974, + "num_input_tokens_seen": 55329739936, + "step": 105550 + }, + { + "epoch": 1.0074316992022132, + "grad_norm": 0.1432969570159912, + "learning_rate": 0.001, + "loss": 2.0991, + "num_input_tokens_seen": 55355942944, + "step": 105600 + }, + { + "epoch": 1.0079087017184016, + "grad_norm": 0.13702726364135742, + "learning_rate": 0.001, + "loss": 2.0935, + "num_input_tokens_seen": 55382157344, + "step": 105650 + }, + { + "epoch": 1.0083857042345898, + "grad_norm": 0.14623892307281494, + "learning_rate": 0.001, + "loss": 2.0941, + "num_input_tokens_seen": 55408359808, + "step": 105700 + }, + { + "epoch": 1.008862706750778, + "grad_norm": 0.14641566574573517, + "learning_rate": 0.001, + "loss": 2.1119, + "num_input_tokens_seen": 55434571200, + "step": 105750 + }, + { + "epoch": 1.0093397092669665, + "grad_norm": 0.14116981625556946, + "learning_rate": 0.001, + "loss": 2.086, + "num_input_tokens_seen": 55460781792, + "step": 105800 + }, + { + "epoch": 1.0098167117831547, + "grad_norm": 0.14575734734535217, + "learning_rate": 0.001, + "loss": 2.1121, + "num_input_tokens_seen": 55486986944, + "step": 105850 + }, + { + "epoch": 1.010293714299343, + "grad_norm": 0.14387919008731842, + "learning_rate": 0.001, + "loss": 2.0949, + "num_input_tokens_seen": 55513194784, + "step": 105900 + }, + { + "epoch": 1.010770716815531, + "grad_norm": 0.14621268212795258, + "learning_rate": 0.001, + "loss": 2.0832, + "num_input_tokens_seen": 55539409184, + "step": 105950 + }, + { + "epoch": 1.0112477193317195, + "grad_norm": 0.1453128159046173, + "learning_rate": 0.001, + "loss": 2.0989, + "num_input_tokens_seen": 55565623584, + "step": 106000 + }, + { + "epoch": 1.0112477193317195, + "eval_loss": 2.0156850814819336, + "eval_runtime": 80.793, + "eval_samples_per_second": 61.887, + "eval_steps_per_second": 15.472, + "num_input_tokens_seen": 55565623584, + "step": 106000 + }, + { + "epoch": 1.0117247218479077, + "grad_norm": 0.1500042974948883, + "learning_rate": 0.001, + "loss": 2.0889, + "num_input_tokens_seen": 55591837984, + "step": 106050 + }, + { + "epoch": 1.012201724364096, + "grad_norm": 0.14505235850811005, + "learning_rate": 0.001, + "loss": 2.1002, + "num_input_tokens_seen": 55618045632, + "step": 106100 + }, + { + "epoch": 1.0126787268802844, + "grad_norm": 0.141426682472229, + "learning_rate": 0.001, + "loss": 2.091, + "num_input_tokens_seen": 55644253280, + "step": 106150 + }, + { + "epoch": 1.0131557293964726, + "grad_norm": 0.1420578509569168, + "learning_rate": 0.001, + "loss": 2.0987, + "num_input_tokens_seen": 55670466400, + "step": 106200 + }, + { + "epoch": 1.0136327319126608, + "grad_norm": 0.14393913745880127, + "learning_rate": 0.001, + "loss": 2.0841, + "num_input_tokens_seen": 55696673792, + "step": 106250 + }, + { + "epoch": 1.0141097344288492, + "grad_norm": 0.14071914553642273, + "learning_rate": 0.001, + "loss": 2.0893, + "num_input_tokens_seen": 55722877888, + "step": 106300 + }, + { + "epoch": 1.0145867369450374, + "grad_norm": 0.13708269596099854, + "learning_rate": 0.001, + "loss": 2.0979, + "num_input_tokens_seen": 55749083968, + "step": 106350 + }, + { + "epoch": 1.0150637394612256, + "grad_norm": 0.20387065410614014, + "learning_rate": 0.001, + "loss": 2.2896, + "num_input_tokens_seen": 55775290528, + "step": 106400 + }, + { + "epoch": 1.0155407419774138, + "grad_norm": 0.1379322111606598, + "learning_rate": 0.001, + "loss": 2.1229, + "num_input_tokens_seen": 55801504320, + "step": 106450 + }, + { + "epoch": 1.0160177444936023, + "grad_norm": 0.14372999966144562, + "learning_rate": 0.001, + "loss": 2.1079, + "num_input_tokens_seen": 55827708736, + "step": 106500 + }, + { + "epoch": 1.0160177444936023, + "eval_loss": 2.018646001815796, + "eval_runtime": 80.5516, + "eval_samples_per_second": 62.072, + "eval_steps_per_second": 15.518, + "num_input_tokens_seen": 55827708736, + "step": 106500 + }, + { + "epoch": 1.0164947470097905, + "grad_norm": 0.13034076988697052, + "learning_rate": 0.001, + "loss": 2.0805, + "num_input_tokens_seen": 55853917024, + "step": 106550 + }, + { + "epoch": 1.0169717495259787, + "grad_norm": 0.12940867245197296, + "learning_rate": 0.001, + "loss": 2.0837, + "num_input_tokens_seen": 55880125632, + "step": 106600 + }, + { + "epoch": 1.0174487520421671, + "grad_norm": 0.13367190957069397, + "learning_rate": 0.001, + "loss": 2.0991, + "num_input_tokens_seen": 55906331840, + "step": 106650 + }, + { + "epoch": 1.0179257545583553, + "grad_norm": 0.13605284690856934, + "learning_rate": 0.001, + "loss": 2.0973, + "num_input_tokens_seen": 55932528160, + "step": 106700 + }, + { + "epoch": 1.0184027570745435, + "grad_norm": 0.14544115960597992, + "learning_rate": 0.001, + "loss": 2.111, + "num_input_tokens_seen": 55958742560, + "step": 106750 + }, + { + "epoch": 1.0188797595907317, + "grad_norm": 0.14273081719875336, + "learning_rate": 0.001, + "loss": 2.0949, + "num_input_tokens_seen": 55984956960, + "step": 106800 + }, + { + "epoch": 1.0193567621069202, + "grad_norm": 0.14157475531101227, + "learning_rate": 0.001, + "loss": 2.0859, + "num_input_tokens_seen": 56011159744, + "step": 106850 + }, + { + "epoch": 1.0198337646231084, + "grad_norm": 0.13682667911052704, + "learning_rate": 0.001, + "loss": 2.0968, + "num_input_tokens_seen": 56037373184, + "step": 106900 + }, + { + "epoch": 1.0203107671392966, + "grad_norm": 0.14256510138511658, + "learning_rate": 0.001, + "loss": 2.0959, + "num_input_tokens_seen": 56063582688, + "step": 106950 + }, + { + "epoch": 1.020787769655485, + "grad_norm": 0.13422270119190216, + "learning_rate": 0.001, + "loss": 2.1069, + "num_input_tokens_seen": 56089794304, + "step": 107000 + }, + { + "epoch": 1.020787769655485, + "eval_loss": 2.0148062705993652, + "eval_runtime": 80.2529, + "eval_samples_per_second": 62.303, + "eval_steps_per_second": 15.576, + "num_input_tokens_seen": 56089794304, + "step": 107000 + }, + { + "epoch": 1.0212647721716732, + "grad_norm": 0.1368064284324646, + "learning_rate": 0.001, + "loss": 2.0883, + "num_input_tokens_seen": 56116003136, + "step": 107050 + }, + { + "epoch": 1.0217417746878614, + "grad_norm": 0.13491053879261017, + "learning_rate": 0.001, + "loss": 2.0913, + "num_input_tokens_seen": 56142210144, + "step": 107100 + }, + { + "epoch": 1.0222187772040499, + "grad_norm": 0.14345191419124603, + "learning_rate": 0.001, + "loss": 2.1019, + "num_input_tokens_seen": 56168416608, + "step": 107150 + }, + { + "epoch": 1.022695779720238, + "grad_norm": 0.14869827032089233, + "learning_rate": 0.001, + "loss": 2.0929, + "num_input_tokens_seen": 56194631008, + "step": 107200 + }, + { + "epoch": 1.0231727822364263, + "grad_norm": 0.14153461158275604, + "learning_rate": 0.001, + "loss": 2.0992, + "num_input_tokens_seen": 56220845408, + "step": 107250 + }, + { + "epoch": 1.0236497847526145, + "grad_norm": 0.1489809900522232, + "learning_rate": 0.001, + "loss": 2.0875, + "num_input_tokens_seen": 56247053216, + "step": 107300 + }, + { + "epoch": 1.024126787268803, + "grad_norm": 0.13485555350780487, + "learning_rate": 0.001, + "loss": 2.0983, + "num_input_tokens_seen": 56273267616, + "step": 107350 + }, + { + "epoch": 1.0246037897849911, + "grad_norm": 0.13658951222896576, + "learning_rate": 0.001, + "loss": 2.0901, + "num_input_tokens_seen": 56299482016, + "step": 107400 + }, + { + "epoch": 1.0250807923011793, + "grad_norm": 0.1356600672006607, + "learning_rate": 0.001, + "loss": 2.1042, + "num_input_tokens_seen": 56325696416, + "step": 107450 + }, + { + "epoch": 1.0255577948173678, + "grad_norm": 0.1527785360813141, + "learning_rate": 0.001, + "loss": 2.0987, + "num_input_tokens_seen": 56351910752, + "step": 107500 + }, + { + "epoch": 1.0255577948173678, + "eval_loss": 2.014725685119629, + "eval_runtime": 80.8852, + "eval_samples_per_second": 61.816, + "eval_steps_per_second": 15.454, + "num_input_tokens_seen": 56351910752, + "step": 107500 + }, + { + "epoch": 1.026034797333556, + "grad_norm": 0.1489991992712021, + "learning_rate": 0.001, + "loss": 2.0878, + "num_input_tokens_seen": 56378123968, + "step": 107550 + }, + { + "epoch": 1.0265117998497442, + "grad_norm": 0.1518663763999939, + "learning_rate": 0.001, + "loss": 2.1044, + "num_input_tokens_seen": 56404334496, + "step": 107600 + }, + { + "epoch": 1.0269888023659324, + "grad_norm": 0.13364924490451813, + "learning_rate": 0.001, + "loss": 2.0928, + "num_input_tokens_seen": 56430545248, + "step": 107650 + }, + { + "epoch": 1.0274658048821208, + "grad_norm": 1.7939748764038086, + "learning_rate": 0.001, + "loss": 2.1125, + "num_input_tokens_seen": 56456755488, + "step": 107700 + }, + { + "epoch": 1.027942807398309, + "grad_norm": 0.14331629872322083, + "learning_rate": 0.001, + "loss": 2.1346, + "num_input_tokens_seen": 56482965472, + "step": 107750 + }, + { + "epoch": 1.0284198099144972, + "grad_norm": 0.14626429975032806, + "learning_rate": 0.001, + "loss": 2.1037, + "num_input_tokens_seen": 56509177408, + "step": 107800 + }, + { + "epoch": 1.0288968124306856, + "grad_norm": 0.15549655258655548, + "learning_rate": 0.001, + "loss": 2.1006, + "num_input_tokens_seen": 56535387392, + "step": 107850 + }, + { + "epoch": 1.0293738149468739, + "grad_norm": 0.13863249123096466, + "learning_rate": 0.001, + "loss": 2.1056, + "num_input_tokens_seen": 56561593632, + "step": 107900 + }, + { + "epoch": 1.029850817463062, + "grad_norm": 0.1429344117641449, + "learning_rate": 0.001, + "loss": 2.0983, + "num_input_tokens_seen": 56587805408, + "step": 107950 + }, + { + "epoch": 1.0303278199792505, + "grad_norm": 0.14651237428188324, + "learning_rate": 0.001, + "loss": 2.095, + "num_input_tokens_seen": 56614015392, + "step": 108000 + }, + { + "epoch": 1.0303278199792505, + "eval_loss": 2.015336036682129, + "eval_runtime": 80.5904, + "eval_samples_per_second": 62.042, + "eval_steps_per_second": 15.511, + "num_input_tokens_seen": 56614015392, + "step": 108000 + }, + { + "epoch": 1.0308048224954387, + "grad_norm": 0.1352015882730484, + "learning_rate": 0.001, + "loss": 2.0908, + "num_input_tokens_seen": 56640217952, + "step": 108050 + }, + { + "epoch": 1.031281825011627, + "grad_norm": 0.13381287455558777, + "learning_rate": 0.001, + "loss": 2.0902, + "num_input_tokens_seen": 56666428640, + "step": 108100 + }, + { + "epoch": 1.0317588275278151, + "grad_norm": 0.14579468965530396, + "learning_rate": 0.001, + "loss": 2.0909, + "num_input_tokens_seen": 56692643040, + "step": 108150 + }, + { + "epoch": 1.0322358300440035, + "grad_norm": 0.14770351350307465, + "learning_rate": 0.001, + "loss": 2.0971, + "num_input_tokens_seen": 56718857440, + "step": 108200 + }, + { + "epoch": 1.0327128325601918, + "grad_norm": 0.14099901914596558, + "learning_rate": 0.001, + "loss": 2.0877, + "num_input_tokens_seen": 56745071840, + "step": 108250 + }, + { + "epoch": 1.03318983507638, + "grad_norm": 0.13044485449790955, + "learning_rate": 0.001, + "loss": 2.0861, + "num_input_tokens_seen": 56771285824, + "step": 108300 + }, + { + "epoch": 1.0336668375925684, + "grad_norm": 0.14185436069965363, + "learning_rate": 0.001, + "loss": 2.1007, + "num_input_tokens_seen": 56797497888, + "step": 108350 + }, + { + "epoch": 1.0341438401087566, + "grad_norm": 0.1411305069923401, + "learning_rate": 0.001, + "loss": 2.093, + "num_input_tokens_seen": 56823710368, + "step": 108400 + }, + { + "epoch": 1.0346208426249448, + "grad_norm": 0.13188087940216064, + "learning_rate": 0.001, + "loss": 2.0855, + "num_input_tokens_seen": 56849920512, + "step": 108450 + }, + { + "epoch": 1.035097845141133, + "grad_norm": 0.13814617693424225, + "learning_rate": 0.001, + "loss": 2.097, + "num_input_tokens_seen": 56876128320, + "step": 108500 + }, + { + "epoch": 1.035097845141133, + "eval_loss": 2.013669490814209, + "eval_runtime": 80.9952, + "eval_samples_per_second": 61.732, + "eval_steps_per_second": 15.433, + "num_input_tokens_seen": 56876128320, + "step": 108500 + }, + { + "epoch": 1.0355748476573214, + "grad_norm": 0.15740585327148438, + "learning_rate": 0.001, + "loss": 2.0945, + "num_input_tokens_seen": 56902334400, + "step": 108550 + }, + { + "epoch": 1.0360518501735096, + "grad_norm": 0.1584336757659912, + "learning_rate": 0.001, + "loss": 2.0828, + "num_input_tokens_seen": 56928547616, + "step": 108600 + }, + { + "epoch": 1.0365288526896979, + "grad_norm": 0.1397266536951065, + "learning_rate": 0.001, + "loss": 2.097, + "num_input_tokens_seen": 56954753664, + "step": 108650 + }, + { + "epoch": 1.0370058552058863, + "grad_norm": 0.14098243415355682, + "learning_rate": 0.001, + "loss": 2.084, + "num_input_tokens_seen": 56980968064, + "step": 108700 + }, + { + "epoch": 1.0374828577220745, + "grad_norm": 0.14463113248348236, + "learning_rate": 0.001, + "loss": 2.0938, + "num_input_tokens_seen": 57007174656, + "step": 108750 + }, + { + "epoch": 1.0379598602382627, + "grad_norm": 0.15365611016750336, + "learning_rate": 0.001, + "loss": 2.0972, + "num_input_tokens_seen": 57033388672, + "step": 108800 + }, + { + "epoch": 1.0384368627544511, + "grad_norm": 0.13319768011569977, + "learning_rate": 0.001, + "loss": 2.095, + "num_input_tokens_seen": 57059598624, + "step": 108850 + }, + { + "epoch": 1.0389138652706393, + "grad_norm": 0.15738600492477417, + "learning_rate": 0.001, + "loss": 2.1002, + "num_input_tokens_seen": 57085810016, + "step": 108900 + }, + { + "epoch": 1.0393908677868275, + "grad_norm": 0.14639179408550262, + "learning_rate": 0.001, + "loss": 2.0989, + "num_input_tokens_seen": 57112024416, + "step": 108950 + }, + { + "epoch": 1.0398678703030158, + "grad_norm": 0.1358439177274704, + "learning_rate": 0.001, + "loss": 2.099, + "num_input_tokens_seen": 57138237248, + "step": 109000 + }, + { + "epoch": 1.0398678703030158, + "eval_loss": 2.0128679275512695, + "eval_runtime": 82.3317, + "eval_samples_per_second": 60.73, + "eval_steps_per_second": 15.182, + "num_input_tokens_seen": 57138237248, + "step": 109000 + }, + { + "epoch": 1.0403448728192042, + "grad_norm": 0.14199453592300415, + "learning_rate": 0.001, + "loss": 2.1023, + "num_input_tokens_seen": 57164448992, + "step": 109050 + }, + { + "epoch": 1.0408218753353924, + "grad_norm": 0.1462697833776474, + "learning_rate": 0.001, + "loss": 2.0933, + "num_input_tokens_seen": 57190658656, + "step": 109100 + }, + { + "epoch": 1.0412988778515806, + "grad_norm": 0.14854200184345245, + "learning_rate": 0.001, + "loss": 2.1052, + "num_input_tokens_seen": 57216873056, + "step": 109150 + }, + { + "epoch": 1.041775880367769, + "grad_norm": 0.140263170003891, + "learning_rate": 0.001, + "loss": 2.0874, + "num_input_tokens_seen": 57243086400, + "step": 109200 + }, + { + "epoch": 1.0422528828839572, + "grad_norm": 0.1429862082004547, + "learning_rate": 0.001, + "loss": 2.0981, + "num_input_tokens_seen": 57269288928, + "step": 109250 + }, + { + "epoch": 1.0427298854001454, + "grad_norm": 0.1370985209941864, + "learning_rate": 0.001, + "loss": 2.0779, + "num_input_tokens_seen": 57295496128, + "step": 109300 + }, + { + "epoch": 1.0432068879163339, + "grad_norm": 0.15176068246364594, + "learning_rate": 0.001, + "loss": 2.0953, + "num_input_tokens_seen": 57321705568, + "step": 109350 + }, + { + "epoch": 1.043683890432522, + "grad_norm": 0.13600246608257294, + "learning_rate": 0.001, + "loss": 2.1109, + "num_input_tokens_seen": 57347914880, + "step": 109400 + }, + { + "epoch": 1.0441608929487103, + "grad_norm": 0.15201528370380402, + "learning_rate": 0.001, + "loss": 2.0852, + "num_input_tokens_seen": 57374122464, + "step": 109450 + }, + { + "epoch": 1.0446378954648985, + "grad_norm": 0.13787305355072021, + "learning_rate": 0.001, + "loss": 2.0952, + "num_input_tokens_seen": 57400333280, + "step": 109500 + }, + { + "epoch": 1.0446378954648985, + "eval_loss": 2.012360095977783, + "eval_runtime": 83.0266, + "eval_samples_per_second": 60.222, + "eval_steps_per_second": 15.055, + "num_input_tokens_seen": 57400333280, + "step": 109500 + }, + { + "epoch": 1.045114897981087, + "grad_norm": 0.13901057839393616, + "learning_rate": 0.001, + "loss": 2.0954, + "num_input_tokens_seen": 57426541536, + "step": 109550 + }, + { + "epoch": 1.0455919004972751, + "grad_norm": 0.13901159167289734, + "learning_rate": 0.001, + "loss": 2.1016, + "num_input_tokens_seen": 57452749952, + "step": 109600 + }, + { + "epoch": 1.0460689030134633, + "grad_norm": 0.14035074412822723, + "learning_rate": 0.001, + "loss": 2.1006, + "num_input_tokens_seen": 57478964352, + "step": 109650 + }, + { + "epoch": 1.0465459055296518, + "grad_norm": 0.13569940626621246, + "learning_rate": 0.001, + "loss": 2.0895, + "num_input_tokens_seen": 57505178752, + "step": 109700 + }, + { + "epoch": 1.04702290804584, + "grad_norm": 0.15281043946743011, + "learning_rate": 0.001, + "loss": 2.0948, + "num_input_tokens_seen": 57531385376, + "step": 109750 + }, + { + "epoch": 1.0474999105620282, + "grad_norm": 0.154220312833786, + "learning_rate": 0.001, + "loss": 2.0957, + "num_input_tokens_seen": 57557599776, + "step": 109800 + }, + { + "epoch": 1.0479769130782164, + "grad_norm": 0.1422448307275772, + "learning_rate": 0.001, + "loss": 2.0968, + "num_input_tokens_seen": 57583807360, + "step": 109850 + }, + { + "epoch": 1.0484539155944048, + "grad_norm": 0.1550014317035675, + "learning_rate": 0.001, + "loss": 2.0924, + "num_input_tokens_seen": 57610015680, + "step": 109900 + }, + { + "epoch": 1.048930918110593, + "grad_norm": 0.1353992372751236, + "learning_rate": 0.001, + "loss": 2.098, + "num_input_tokens_seen": 57636230080, + "step": 109950 + }, + { + "epoch": 1.0494079206267812, + "grad_norm": 0.15528377890586853, + "learning_rate": 0.001, + "loss": 2.0895, + "num_input_tokens_seen": 57662439520, + "step": 110000 + }, + { + "epoch": 1.0494079206267812, + "eval_loss": 2.013206958770752, + "eval_runtime": 83.3169, + "eval_samples_per_second": 60.012, + "eval_steps_per_second": 15.003, + "num_input_tokens_seen": 57662439520, + "step": 110000 + }, + { + "epoch": 1.0498849231429697, + "grad_norm": 0.14630495011806488, + "learning_rate": 0.001, + "loss": 2.104, + "num_input_tokens_seen": 57688651552, + "step": 110050 + }, + { + "epoch": 1.0503619256591579, + "grad_norm": 0.15211914479732513, + "learning_rate": 0.001, + "loss": 2.0929, + "num_input_tokens_seen": 57714851968, + "step": 110100 + }, + { + "epoch": 1.050838928175346, + "grad_norm": 0.15288826823234558, + "learning_rate": 0.001, + "loss": 2.107, + "num_input_tokens_seen": 57741066368, + "step": 110150 + }, + { + "epoch": 1.0513159306915343, + "grad_norm": 0.18201424181461334, + "learning_rate": 0.001, + "loss": 2.0831, + "num_input_tokens_seen": 57767277728, + "step": 110200 + }, + { + "epoch": 1.0517929332077227, + "grad_norm": 0.15570518374443054, + "learning_rate": 0.001, + "loss": 2.1017, + "num_input_tokens_seen": 57793489344, + "step": 110250 + }, + { + "epoch": 1.052269935723911, + "grad_norm": 0.15590184926986694, + "learning_rate": 0.001, + "loss": 2.0851, + "num_input_tokens_seen": 57819702752, + "step": 110300 + }, + { + "epoch": 1.0527469382400991, + "grad_norm": 0.13825973868370056, + "learning_rate": 0.001, + "loss": 2.0966, + "num_input_tokens_seen": 57845893728, + "step": 110350 + }, + { + "epoch": 1.0532239407562876, + "grad_norm": 0.1424342691898346, + "learning_rate": 0.001, + "loss": 2.0915, + "num_input_tokens_seen": 57872104704, + "step": 110400 + }, + { + "epoch": 1.0537009432724758, + "grad_norm": 0.14073631167411804, + "learning_rate": 0.001, + "loss": 2.1029, + "num_input_tokens_seen": 57898312928, + "step": 110450 + }, + { + "epoch": 1.054177945788664, + "grad_norm": 0.1437380015850067, + "learning_rate": 0.001, + "loss": 2.0945, + "num_input_tokens_seen": 57924525696, + "step": 110500 + }, + { + "epoch": 1.054177945788664, + "eval_loss": 2.01132869720459, + "eval_runtime": 83.1173, + "eval_samples_per_second": 60.156, + "eval_steps_per_second": 15.039, + "num_input_tokens_seen": 57924525696, + "step": 110500 + }, + { + "epoch": 1.0546549483048524, + "grad_norm": 0.14192169904708862, + "learning_rate": 0.001, + "loss": 2.0819, + "num_input_tokens_seen": 57950733824, + "step": 110550 + }, + { + "epoch": 1.0551319508210406, + "grad_norm": 0.13621965050697327, + "learning_rate": 0.001, + "loss": 2.096, + "num_input_tokens_seen": 57976940256, + "step": 110600 + }, + { + "epoch": 1.0556089533372288, + "grad_norm": 0.14394508302211761, + "learning_rate": 0.001, + "loss": 2.0959, + "num_input_tokens_seen": 58003150272, + "step": 110650 + }, + { + "epoch": 1.056085955853417, + "grad_norm": 0.15062682330608368, + "learning_rate": 0.001, + "loss": 2.0912, + "num_input_tokens_seen": 58029359264, + "step": 110700 + }, + { + "epoch": 1.0565629583696055, + "grad_norm": 0.15145541727542877, + "learning_rate": 0.001, + "loss": 2.0954, + "num_input_tokens_seen": 58055566272, + "step": 110750 + }, + { + "epoch": 1.0570399608857937, + "grad_norm": 0.14316266775131226, + "learning_rate": 0.001, + "loss": 2.0871, + "num_input_tokens_seen": 58081774848, + "step": 110800 + }, + { + "epoch": 1.0575169634019819, + "grad_norm": 0.1519429087638855, + "learning_rate": 0.001, + "loss": 2.1033, + "num_input_tokens_seen": 58107987648, + "step": 110850 + }, + { + "epoch": 1.0579939659181703, + "grad_norm": 0.1556522697210312, + "learning_rate": 0.001, + "loss": 2.0921, + "num_input_tokens_seen": 58134199072, + "step": 110900 + }, + { + "epoch": 1.0584709684343585, + "grad_norm": 0.14416266977787018, + "learning_rate": 0.001, + "loss": 2.0952, + "num_input_tokens_seen": 58160410880, + "step": 110950 + }, + { + "epoch": 1.0589479709505467, + "grad_norm": 0.14932425320148468, + "learning_rate": 0.001, + "loss": 2.0978, + "num_input_tokens_seen": 58186625280, + "step": 111000 + }, + { + "epoch": 1.0589479709505467, + "eval_loss": 2.010718584060669, + "eval_runtime": 83.9004, + "eval_samples_per_second": 59.594, + "eval_steps_per_second": 14.899, + "num_input_tokens_seen": 58186625280, + "step": 111000 + }, + { + "epoch": 1.0594249734667351, + "grad_norm": 0.14156724512577057, + "learning_rate": 0.001, + "loss": 2.1053, + "num_input_tokens_seen": 58212829344, + "step": 111050 + }, + { + "epoch": 1.0599019759829234, + "grad_norm": 0.13802699744701385, + "learning_rate": 0.001, + "loss": 2.0977, + "num_input_tokens_seen": 58239042144, + "step": 111100 + }, + { + "epoch": 1.0603789784991116, + "grad_norm": 0.1430686116218567, + "learning_rate": 0.001, + "loss": 2.085, + "num_input_tokens_seen": 58265254688, + "step": 111150 + }, + { + "epoch": 1.0608559810152998, + "grad_norm": 0.1441573053598404, + "learning_rate": 0.001, + "loss": 2.0912, + "num_input_tokens_seen": 58291469088, + "step": 111200 + }, + { + "epoch": 1.0613329835314882, + "grad_norm": 0.14418621361255646, + "learning_rate": 0.001, + "loss": 2.0902, + "num_input_tokens_seen": 58317683424, + "step": 111250 + }, + { + "epoch": 1.0618099860476764, + "grad_norm": 0.140812486410141, + "learning_rate": 0.001, + "loss": 2.097, + "num_input_tokens_seen": 58343897024, + "step": 111300 + }, + { + "epoch": 1.0622869885638646, + "grad_norm": 0.13597142696380615, + "learning_rate": 0.001, + "loss": 2.0914, + "num_input_tokens_seen": 58370104576, + "step": 111350 + }, + { + "epoch": 1.062763991080053, + "grad_norm": 0.14179456233978271, + "learning_rate": 0.001, + "loss": 2.0888, + "num_input_tokens_seen": 58396317792, + "step": 111400 + }, + { + "epoch": 1.0632409935962412, + "grad_norm": 0.15151284635066986, + "learning_rate": 0.001, + "loss": 2.0825, + "num_input_tokens_seen": 58422525344, + "step": 111450 + }, + { + "epoch": 1.0637179961124295, + "grad_norm": 0.15292806923389435, + "learning_rate": 0.001, + "loss": 2.0873, + "num_input_tokens_seen": 58448739744, + "step": 111500 + }, + { + "epoch": 1.0637179961124295, + "eval_loss": 2.010636329650879, + "eval_runtime": 82.455, + "eval_samples_per_second": 60.639, + "eval_steps_per_second": 15.16, + "num_input_tokens_seen": 58448739744, + "step": 111500 + }, + { + "epoch": 1.0641949986286177, + "grad_norm": 0.1312059611082077, + "learning_rate": 0.001, + "loss": 2.0916, + "num_input_tokens_seen": 58474952288, + "step": 111550 + }, + { + "epoch": 1.064672001144806, + "grad_norm": 0.14581365883350372, + "learning_rate": 0.001, + "loss": 2.08, + "num_input_tokens_seen": 58501161280, + "step": 111600 + }, + { + "epoch": 1.0651490036609943, + "grad_norm": 0.1432618349790573, + "learning_rate": 0.001, + "loss": 2.0919, + "num_input_tokens_seen": 58527355744, + "step": 111650 + }, + { + "epoch": 1.0656260061771825, + "grad_norm": 0.14399270713329315, + "learning_rate": 0.001, + "loss": 2.095, + "num_input_tokens_seen": 58553570144, + "step": 111700 + }, + { + "epoch": 1.066103008693371, + "grad_norm": 0.14912860095500946, + "learning_rate": 0.001, + "loss": 2.0856, + "num_input_tokens_seen": 58579784544, + "step": 111750 + }, + { + "epoch": 1.0665800112095591, + "grad_norm": 0.13581617176532745, + "learning_rate": 0.001, + "loss": 2.0919, + "num_input_tokens_seen": 58605992992, + "step": 111800 + }, + { + "epoch": 1.0670570137257473, + "grad_norm": 0.1407386064529419, + "learning_rate": 0.001, + "loss": 2.0906, + "num_input_tokens_seen": 58632200352, + "step": 111850 + }, + { + "epoch": 1.0675340162419358, + "grad_norm": 0.15018606185913086, + "learning_rate": 0.001, + "loss": 2.0932, + "num_input_tokens_seen": 58658411040, + "step": 111900 + }, + { + "epoch": 1.068011018758124, + "grad_norm": 0.15312473475933075, + "learning_rate": 0.001, + "loss": 2.0941, + "num_input_tokens_seen": 58684621696, + "step": 111950 + }, + { + "epoch": 1.0684880212743122, + "grad_norm": 0.155229389667511, + "learning_rate": 0.001, + "loss": 2.0768, + "num_input_tokens_seen": 58710836096, + "step": 112000 + }, + { + "epoch": 1.0684880212743122, + "eval_loss": 2.0109803676605225, + "eval_runtime": 82.3106, + "eval_samples_per_second": 60.745, + "eval_steps_per_second": 15.186, + "num_input_tokens_seen": 58710836096, + "step": 112000 + }, + { + "epoch": 1.0689650237905004, + "grad_norm": 0.13854993879795074, + "learning_rate": 0.0009999921320324326, + "loss": 2.0951, + "num_input_tokens_seen": 58737048768, + "step": 112050 + }, + { + "epoch": 1.0694420263066888, + "grad_norm": 0.15185818076133728, + "learning_rate": 0.0009999685283773503, + "loss": 2.0978, + "num_input_tokens_seen": 58763257760, + "step": 112100 + }, + { + "epoch": 1.069919028822877, + "grad_norm": 0.13926972448825836, + "learning_rate": 0.000999929189777604, + "loss": 2.0967, + "num_input_tokens_seen": 58789471168, + "step": 112150 + }, + { + "epoch": 1.0703960313390652, + "grad_norm": 0.1538979411125183, + "learning_rate": 0.0009998741174712534, + "loss": 2.0981, + "num_input_tokens_seen": 58815685568, + "step": 112200 + }, + { + "epoch": 1.0708730338552537, + "grad_norm": 0.16998735070228577, + "learning_rate": 0.0009998033131915266, + "loss": 2.092, + "num_input_tokens_seen": 58841890880, + "step": 112250 + }, + { + "epoch": 1.0713500363714419, + "grad_norm": 0.13714225590229034, + "learning_rate": 0.0009997167791667668, + "loss": 2.0834, + "num_input_tokens_seen": 58868091424, + "step": 112300 + }, + { + "epoch": 1.07182703888763, + "grad_norm": 0.1447121649980545, + "learning_rate": 0.0009996145181203615, + "loss": 2.1021, + "num_input_tokens_seen": 58894304800, + "step": 112350 + }, + { + "epoch": 1.0723040414038185, + "grad_norm": 0.14412052929401398, + "learning_rate": 0.0009994965332706573, + "loss": 2.0748, + "num_input_tokens_seen": 58920503936, + "step": 112400 + }, + { + "epoch": 1.0727810439200067, + "grad_norm": 0.14241209626197815, + "learning_rate": 0.000999362828330858, + "loss": 2.0932, + "num_input_tokens_seen": 58946705920, + "step": 112450 + }, + { + "epoch": 1.073258046436195, + "grad_norm": 0.13844363391399384, + "learning_rate": 0.0009992134075089082, + "loss": 2.0901, + "num_input_tokens_seen": 58972918688, + "step": 112500 + }, + { + "epoch": 1.073258046436195, + "eval_loss": 2.0120961666107178, + "eval_runtime": 82.9684, + "eval_samples_per_second": 60.264, + "eval_steps_per_second": 15.066, + "num_input_tokens_seen": 58972918688, + "step": 112500 + }, + { + "epoch": 1.0737350489523831, + "grad_norm": 0.14107921719551086, + "learning_rate": 0.0009990482755073606, + "loss": 2.0982, + "num_input_tokens_seen": 58999132320, + "step": 112550 + }, + { + "epoch": 1.0742120514685716, + "grad_norm": 0.1476968675851822, + "learning_rate": 0.000998867437523228, + "loss": 2.0945, + "num_input_tokens_seen": 59025346720, + "step": 112600 + }, + { + "epoch": 1.0746890539847598, + "grad_norm": 0.14322301745414734, + "learning_rate": 0.0009986708992478202, + "loss": 2.0886, + "num_input_tokens_seen": 59051559072, + "step": 112650 + }, + { + "epoch": 1.075166056500948, + "grad_norm": 0.14406149089336395, + "learning_rate": 0.000998458666866564, + "loss": 2.0951, + "num_input_tokens_seen": 59077772416, + "step": 112700 + }, + { + "epoch": 1.0756430590171364, + "grad_norm": 0.14645279943943024, + "learning_rate": 0.0009982307470588097, + "loss": 2.0924, + "num_input_tokens_seen": 59103984384, + "step": 112750 + }, + { + "epoch": 1.0761200615333246, + "grad_norm": 0.14315037429332733, + "learning_rate": 0.0009979871469976197, + "loss": 2.0866, + "num_input_tokens_seen": 59130197568, + "step": 112800 + }, + { + "epoch": 1.0765970640495128, + "grad_norm": 0.15335896611213684, + "learning_rate": 0.0009977278743495434, + "loss": 2.0834, + "num_input_tokens_seen": 59156411840, + "step": 112850 + }, + { + "epoch": 1.077074066565701, + "grad_norm": 0.14293555915355682, + "learning_rate": 0.0009974529372743762, + "loss": 2.1005, + "num_input_tokens_seen": 59182626240, + "step": 112900 + }, + { + "epoch": 1.0775510690818895, + "grad_norm": 0.14281043410301208, + "learning_rate": 0.000997162344424902, + "loss": 2.1049, + "num_input_tokens_seen": 59208840640, + "step": 112950 + }, + { + "epoch": 1.0780280715980777, + "grad_norm": 0.12666131556034088, + "learning_rate": 0.0009968561049466214, + "loss": 2.0889, + "num_input_tokens_seen": 59235047232, + "step": 113000 + }, + { + "epoch": 1.0780280715980777, + "eval_loss": 2.0116124153137207, + "eval_runtime": 109.4574, + "eval_samples_per_second": 45.68, + "eval_steps_per_second": 11.42, + "num_input_tokens_seen": 59235047232, + "step": 113000 + }, + { + "epoch": 1.0785050741142659, + "grad_norm": 0.14696183800697327, + "learning_rate": 0.0009965342284774632, + "loss": 2.084, + "num_input_tokens_seen": 59261256096, + "step": 113050 + }, + { + "epoch": 1.0789820766304543, + "grad_norm": 0.1535506546497345, + "learning_rate": 0.0009961967251474822, + "loss": 2.0905, + "num_input_tokens_seen": 59287464384, + "step": 113100 + }, + { + "epoch": 1.0794590791466425, + "grad_norm": 0.14321501553058624, + "learning_rate": 0.000995843605578539, + "loss": 2.0971, + "num_input_tokens_seen": 59313669856, + "step": 113150 + }, + { + "epoch": 1.0799360816628307, + "grad_norm": 0.15687337517738342, + "learning_rate": 0.0009954748808839674, + "loss": 2.0864, + "num_input_tokens_seen": 59339879328, + "step": 113200 + }, + { + "epoch": 1.080413084179019, + "grad_norm": 0.16271081566810608, + "learning_rate": 0.000995090562668223, + "loss": 2.0948, + "num_input_tokens_seen": 59366089088, + "step": 113250 + }, + { + "epoch": 1.0808900866952074, + "grad_norm": 0.14683839678764343, + "learning_rate": 0.0009946906630265184, + "loss": 2.105, + "num_input_tokens_seen": 59392300448, + "step": 113300 + }, + { + "epoch": 1.0813670892113956, + "grad_norm": 0.15148819983005524, + "learning_rate": 0.0009942751945444437, + "loss": 2.0814, + "num_input_tokens_seen": 59418514560, + "step": 113350 + }, + { + "epoch": 1.0818440917275838, + "grad_norm": 0.14587359130382538, + "learning_rate": 0.0009938441702975688, + "loss": 2.0943, + "num_input_tokens_seen": 59444719360, + "step": 113400 + }, + { + "epoch": 1.0823210942437722, + "grad_norm": 0.14699944853782654, + "learning_rate": 0.0009933976038510332, + "loss": 2.0927, + "num_input_tokens_seen": 59470933600, + "step": 113450 + }, + { + "epoch": 1.0827980967599604, + "grad_norm": 0.14229649305343628, + "learning_rate": 0.0009929355092591179, + "loss": 2.0985, + "num_input_tokens_seen": 59497148000, + "step": 113500 + }, + { + "epoch": 1.0827980967599604, + "eval_loss": 2.009983539581299, + "eval_runtime": 82.6823, + "eval_samples_per_second": 60.472, + "eval_steps_per_second": 15.118, + "num_input_tokens_seen": 59497148000, + "step": 113500 + }, + { + "epoch": 1.0832750992761486, + "grad_norm": 0.14160077273845673, + "learning_rate": 0.0009924579010648041, + "loss": 2.0935, + "num_input_tokens_seen": 59523359584, + "step": 113550 + }, + { + "epoch": 1.083752101792337, + "grad_norm": 0.1411445587873459, + "learning_rate": 0.0009919647942993148, + "loss": 2.093, + "num_input_tokens_seen": 59549569568, + "step": 113600 + }, + { + "epoch": 1.0842291043085253, + "grad_norm": 0.13501347601413727, + "learning_rate": 0.0009914562044816423, + "loss": 2.0919, + "num_input_tokens_seen": 59575783200, + "step": 113650 + }, + { + "epoch": 1.0847061068247135, + "grad_norm": 0.14355099201202393, + "learning_rate": 0.0009909321476180592, + "loss": 2.0913, + "num_input_tokens_seen": 59601990304, + "step": 113700 + }, + { + "epoch": 1.0851831093409017, + "grad_norm": 0.13246339559555054, + "learning_rate": 0.0009903926402016153, + "loss": 2.0803, + "num_input_tokens_seen": 59628197120, + "step": 113750 + }, + { + "epoch": 1.08566011185709, + "grad_norm": 0.13418996334075928, + "learning_rate": 0.0009898376992116178, + "loss": 2.1042, + "num_input_tokens_seen": 59654409856, + "step": 113800 + }, + { + "epoch": 1.0861371143732783, + "grad_norm": 0.15235918760299683, + "learning_rate": 0.0009892673421130977, + "loss": 2.0987, + "num_input_tokens_seen": 59680620096, + "step": 113850 + }, + { + "epoch": 1.0866141168894665, + "grad_norm": 0.1395738422870636, + "learning_rate": 0.0009886815868562597, + "loss": 2.0932, + "num_input_tokens_seen": 59706827264, + "step": 113900 + }, + { + "epoch": 1.087091119405655, + "grad_norm": 0.1433008313179016, + "learning_rate": 0.000988080451875917, + "loss": 2.0943, + "num_input_tokens_seen": 59733034688, + "step": 113950 + }, + { + "epoch": 1.0875681219218432, + "grad_norm": 0.14490137994289398, + "learning_rate": 0.0009874639560909118, + "loss": 2.1012, + "num_input_tokens_seen": 59759249088, + "step": 114000 + }, + { + "epoch": 1.0875681219218432, + "eval_loss": 2.0104737281799316, + "eval_runtime": 82.5956, + "eval_samples_per_second": 60.536, + "eval_steps_per_second": 15.134, + "num_input_tokens_seen": 59759249088, + "step": 114000 + }, + { + "epoch": 1.0880451244380314, + "grad_norm": 0.1358513981103897, + "learning_rate": 0.0009868321189035196, + "loss": 2.1057, + "num_input_tokens_seen": 59785457920, + "step": 114050 + }, + { + "epoch": 1.0885221269542198, + "grad_norm": 0.14738275110721588, + "learning_rate": 0.0009861849601988384, + "loss": 2.099, + "num_input_tokens_seen": 59811672288, + "step": 114100 + }, + { + "epoch": 1.088999129470408, + "grad_norm": 0.16324234008789062, + "learning_rate": 0.0009855225003441628, + "loss": 2.0952, + "num_input_tokens_seen": 59837885600, + "step": 114150 + }, + { + "epoch": 1.0894761319865962, + "grad_norm": 0.15156808495521545, + "learning_rate": 0.0009848447601883434, + "loss": 2.1014, + "num_input_tokens_seen": 59864099392, + "step": 114200 + }, + { + "epoch": 1.0899531345027844, + "grad_norm": 0.14273667335510254, + "learning_rate": 0.0009841517610611307, + "loss": 2.0898, + "num_input_tokens_seen": 59890311072, + "step": 114250 + }, + { + "epoch": 1.0904301370189728, + "grad_norm": 0.1409289538860321, + "learning_rate": 0.0009834435247725033, + "loss": 2.0798, + "num_input_tokens_seen": 59916523776, + "step": 114300 + }, + { + "epoch": 1.090907139535161, + "grad_norm": 0.13659177720546722, + "learning_rate": 0.0009827200736119814, + "loss": 2.084, + "num_input_tokens_seen": 59942727744, + "step": 114350 + }, + { + "epoch": 1.0913841420513493, + "grad_norm": 0.14861910045146942, + "learning_rate": 0.0009819814303479266, + "loss": 2.1021, + "num_input_tokens_seen": 59968942144, + "step": 114400 + }, + { + "epoch": 1.0918611445675377, + "grad_norm": 0.13872170448303223, + "learning_rate": 0.0009812276182268236, + "loss": 2.1001, + "num_input_tokens_seen": 59995154848, + "step": 114450 + }, + { + "epoch": 1.092338147083726, + "grad_norm": 0.14306657016277313, + "learning_rate": 0.00098045866097255, + "loss": 2.0837, + "num_input_tokens_seen": 60021363392, + "step": 114500 + }, + { + "epoch": 1.092338147083726, + "eval_loss": 2.0082569122314453, + "eval_runtime": 82.8417, + "eval_samples_per_second": 60.356, + "eval_steps_per_second": 15.089, + "num_input_tokens_seen": 60021363392, + "step": 114500 + }, + { + "epoch": 1.092815149599914, + "grad_norm": 0.1300678551197052, + "learning_rate": 0.000979674582785628, + "loss": 2.0904, + "num_input_tokens_seen": 60047570880, + "step": 114550 + }, + { + "epoch": 1.0932921521161023, + "grad_norm": 0.1488349586725235, + "learning_rate": 0.0009788754083424652, + "loss": 2.0969, + "num_input_tokens_seen": 60073778944, + "step": 114600 + }, + { + "epoch": 1.0937691546322907, + "grad_norm": 0.14389395713806152, + "learning_rate": 0.000978061162794576, + "loss": 2.0956, + "num_input_tokens_seen": 60099993344, + "step": 114650 + }, + { + "epoch": 1.094246157148479, + "grad_norm": 0.13556672632694244, + "learning_rate": 0.0009772318717677904, + "loss": 2.0856, + "num_input_tokens_seen": 60126204832, + "step": 114700 + }, + { + "epoch": 1.0947231596646672, + "grad_norm": 0.14573290944099426, + "learning_rate": 0.0009763875613614481, + "loss": 2.083, + "num_input_tokens_seen": 60152411456, + "step": 114750 + }, + { + "epoch": 1.0952001621808556, + "grad_norm": 0.14349648356437683, + "learning_rate": 0.0009755282581475768, + "loss": 2.099, + "num_input_tokens_seen": 60178616832, + "step": 114800 + }, + { + "epoch": 1.0956771646970438, + "grad_norm": 0.1363336592912674, + "learning_rate": 0.0009746539891700557, + "loss": 2.0941, + "num_input_tokens_seen": 60204821568, + "step": 114850 + }, + { + "epoch": 1.096154167213232, + "grad_norm": 0.14463187754154205, + "learning_rate": 0.0009737647819437645, + "loss": 2.0987, + "num_input_tokens_seen": 60231035968, + "step": 114900 + }, + { + "epoch": 1.0966311697294202, + "grad_norm": 0.14132525026798248, + "learning_rate": 0.0009728606644537177, + "loss": 2.0954, + "num_input_tokens_seen": 60257250368, + "step": 114950 + }, + { + "epoch": 1.0971081722456086, + "grad_norm": 0.14640025794506073, + "learning_rate": 0.0009719416651541838, + "loss": 2.0992, + "num_input_tokens_seen": 60283464768, + "step": 115000 + }, + { + "epoch": 1.0971081722456086, + "eval_loss": 2.007655620574951, + "eval_runtime": 82.4937, + "eval_samples_per_second": 60.611, + "eval_steps_per_second": 15.153, + "num_input_tokens_seen": 60283464768, + "step": 115000 + }, + { + "epoch": 1.0975851747617968, + "grad_norm": 0.14440514147281647, + "learning_rate": 0.0009710078129677895, + "loss": 2.0927, + "num_input_tokens_seen": 60309676352, + "step": 115050 + }, + { + "epoch": 1.098062177277985, + "grad_norm": 0.13419468700885773, + "learning_rate": 0.0009700591372846095, + "loss": 2.0871, + "num_input_tokens_seen": 60335889280, + "step": 115100 + }, + { + "epoch": 1.0985391797941735, + "grad_norm": 0.14434845745563507, + "learning_rate": 0.0009690956679612422, + "loss": 2.0823, + "num_input_tokens_seen": 60362096256, + "step": 115150 + }, + { + "epoch": 1.0990161823103617, + "grad_norm": 0.14158272743225098, + "learning_rate": 0.0009681174353198686, + "loss": 2.0932, + "num_input_tokens_seen": 60388308192, + "step": 115200 + }, + { + "epoch": 1.09949318482655, + "grad_norm": 0.1499590128660202, + "learning_rate": 0.0009671244701472999, + "loss": 2.0901, + "num_input_tokens_seen": 60414516160, + "step": 115250 + }, + { + "epoch": 1.0999701873427383, + "grad_norm": 0.13877320289611816, + "learning_rate": 0.0009661168036940071, + "loss": 2.0915, + "num_input_tokens_seen": 60440722624, + "step": 115300 + }, + { + "epoch": 1.1004471898589265, + "grad_norm": 0.14336808025836945, + "learning_rate": 0.0009650944676731382, + "loss": 2.0846, + "num_input_tokens_seen": 60466923616, + "step": 115350 + }, + { + "epoch": 1.1009241923751147, + "grad_norm": 0.16042272746562958, + "learning_rate": 0.0009640574942595195, + "loss": 2.0942, + "num_input_tokens_seen": 60493123456, + "step": 115400 + }, + { + "epoch": 1.101401194891303, + "grad_norm": 0.14399364590644836, + "learning_rate": 0.0009630059160886439, + "loss": 2.0988, + "num_input_tokens_seen": 60519323040, + "step": 115450 + }, + { + "epoch": 1.1018781974074914, + "grad_norm": 0.14042776823043823, + "learning_rate": 0.0009619397662556434, + "loss": 2.0916, + "num_input_tokens_seen": 60545534656, + "step": 115500 + }, + { + "epoch": 1.1018781974074914, + "eval_loss": 2.0105109214782715, + "eval_runtime": 82.3145, + "eval_samples_per_second": 60.743, + "eval_steps_per_second": 15.186, + "num_input_tokens_seen": 60545534656, + "step": 115500 + }, + { + "epoch": 1.1023551999236796, + "grad_norm": 0.1399744153022766, + "learning_rate": 0.000960859078314247, + "loss": 2.096, + "num_input_tokens_seen": 60571738272, + "step": 115550 + }, + { + "epoch": 1.1028322024398678, + "grad_norm": 0.14161787927150726, + "learning_rate": 0.0009597638862757254, + "loss": 2.0916, + "num_input_tokens_seen": 60597952672, + "step": 115600 + }, + { + "epoch": 1.1033092049560562, + "grad_norm": 0.14088015258312225, + "learning_rate": 0.0009586542246078203, + "loss": 2.0856, + "num_input_tokens_seen": 60624155648, + "step": 115650 + }, + { + "epoch": 1.1037862074722444, + "grad_norm": 0.13098938763141632, + "learning_rate": 0.00095753012823366, + "loss": 2.0849, + "num_input_tokens_seen": 60650370048, + "step": 115700 + }, + { + "epoch": 1.1042632099884326, + "grad_norm": 0.14463865756988525, + "learning_rate": 0.0009563916325306594, + "loss": 2.0918, + "num_input_tokens_seen": 60676580928, + "step": 115750 + }, + { + "epoch": 1.104740212504621, + "grad_norm": 0.14490677416324615, + "learning_rate": 0.000955238773329408, + "loss": 2.0996, + "num_input_tokens_seen": 60702794432, + "step": 115800 + }, + { + "epoch": 1.1052172150208093, + "grad_norm": 0.14372467994689941, + "learning_rate": 0.0009540715869125407, + "loss": 2.09, + "num_input_tokens_seen": 60729000064, + "step": 115850 + }, + { + "epoch": 1.1056942175369975, + "grad_norm": 0.16468504071235657, + "learning_rate": 0.000952890110013597, + "loss": 2.0901, + "num_input_tokens_seen": 60755212896, + "step": 115900 + }, + { + "epoch": 1.1061712200531857, + "grad_norm": 0.390666663646698, + "learning_rate": 0.0009516943798158648, + "loss": 2.0855, + "num_input_tokens_seen": 60781425984, + "step": 115950 + }, + { + "epoch": 1.1066482225693741, + "grad_norm": 0.14308005571365356, + "learning_rate": 0.0009504844339512095, + "loss": 2.1125, + "num_input_tokens_seen": 60807636160, + "step": 116000 + }, + { + "epoch": 1.1066482225693741, + "eval_loss": 2.0120937824249268, + "eval_runtime": 82.7927, + "eval_samples_per_second": 60.392, + "eval_steps_per_second": 15.098, + "num_input_tokens_seen": 60807636160, + "step": 116000 + }, + { + "epoch": 1.1071252250855623, + "grad_norm": 0.13944968581199646, + "learning_rate": 0.0009492603104988907, + "loss": 2.1028, + "num_input_tokens_seen": 60833850560, + "step": 116050 + }, + { + "epoch": 1.1076022276017505, + "grad_norm": 0.14454355835914612, + "learning_rate": 0.0009480220479843627, + "loss": 2.0995, + "num_input_tokens_seen": 60860064224, + "step": 116100 + }, + { + "epoch": 1.108079230117939, + "grad_norm": 0.1737418919801712, + "learning_rate": 0.0009467696853780625, + "loss": 2.0841, + "num_input_tokens_seen": 60886278080, + "step": 116150 + }, + { + "epoch": 1.1085562326341272, + "grad_norm": 0.1442703902721405, + "learning_rate": 0.0009455032620941839, + "loss": 2.0847, + "num_input_tokens_seen": 60912488608, + "step": 116200 + }, + { + "epoch": 1.1090332351503154, + "grad_norm": 0.14151588082313538, + "learning_rate": 0.0009442228179894363, + "loss": 2.0939, + "num_input_tokens_seen": 60938699264, + "step": 116250 + }, + { + "epoch": 1.1095102376665036, + "grad_norm": 0.12823954224586487, + "learning_rate": 0.00094292839336179, + "loss": 2.0911, + "num_input_tokens_seen": 60964913664, + "step": 116300 + }, + { + "epoch": 1.109987240182692, + "grad_norm": 0.1551038920879364, + "learning_rate": 0.0009416200289492091, + "loss": 2.0905, + "num_input_tokens_seen": 60991126176, + "step": 116350 + }, + { + "epoch": 1.1104642426988802, + "grad_norm": 0.14844666421413422, + "learning_rate": 0.000940297765928369, + "loss": 2.0853, + "num_input_tokens_seen": 61017336640, + "step": 116400 + }, + { + "epoch": 1.1109412452150684, + "grad_norm": 0.14786940813064575, + "learning_rate": 0.0009389616459133597, + "loss": 2.0948, + "num_input_tokens_seen": 61043543488, + "step": 116450 + }, + { + "epoch": 1.1114182477312569, + "grad_norm": 0.1404752880334854, + "learning_rate": 0.0009376117109543769, + "loss": 2.0889, + "num_input_tokens_seen": 61069752768, + "step": 116500 + }, + { + "epoch": 1.1114182477312569, + "eval_loss": 2.007530450820923, + "eval_runtime": 83.3145, + "eval_samples_per_second": 60.014, + "eval_steps_per_second": 15.003, + "num_input_tokens_seen": 61069752768, + "step": 116500 + }, + { + "epoch": 1.111895250247445, + "grad_norm": 0.14887551963329315, + "learning_rate": 0.0009362480035363986, + "loss": 2.0906, + "num_input_tokens_seen": 61095967168, + "step": 116550 + }, + { + "epoch": 1.1123722527636333, + "grad_norm": 0.1436939537525177, + "learning_rate": 0.0009348705665778478, + "loss": 2.0857, + "num_input_tokens_seen": 61122178400, + "step": 116600 + }, + { + "epoch": 1.1128492552798217, + "grad_norm": 0.15015645325183868, + "learning_rate": 0.0009334794434292415, + "loss": 2.0877, + "num_input_tokens_seen": 61148383936, + "step": 116650 + }, + { + "epoch": 1.11332625779601, + "grad_norm": 0.15639320015907288, + "learning_rate": 0.0009320746778718274, + "loss": 2.082, + "num_input_tokens_seen": 61174590560, + "step": 116700 + }, + { + "epoch": 1.1138032603121981, + "grad_norm": 0.1376616209745407, + "learning_rate": 0.0009306563141162046, + "loss": 2.0893, + "num_input_tokens_seen": 61200799104, + "step": 116750 + }, + { + "epoch": 1.1142802628283863, + "grad_norm": 0.13897264003753662, + "learning_rate": 0.000929224396800933, + "loss": 2.0885, + "num_input_tokens_seen": 61227004960, + "step": 116800 + }, + { + "epoch": 1.1147572653445748, + "grad_norm": 0.16240862011909485, + "learning_rate": 0.0009277789709911291, + "loss": 2.0772, + "num_input_tokens_seen": 61253214976, + "step": 116850 + }, + { + "epoch": 1.115234267860763, + "grad_norm": 0.13620969653129578, + "learning_rate": 0.0009263200821770461, + "loss": 2.0815, + "num_input_tokens_seen": 61279425344, + "step": 116900 + }, + { + "epoch": 1.1157112703769512, + "grad_norm": 0.13625779747962952, + "learning_rate": 0.0009248477762726437, + "loss": 2.0834, + "num_input_tokens_seen": 61305623936, + "step": 116950 + }, + { + "epoch": 1.1161882728931396, + "grad_norm": 0.1379876434803009, + "learning_rate": 0.0009233620996141421, + "loss": 2.0879, + "num_input_tokens_seen": 61331831488, + "step": 117000 + }, + { + "epoch": 1.1161882728931396, + "eval_loss": 2.0054421424865723, + "eval_runtime": 82.7611, + "eval_samples_per_second": 60.415, + "eval_steps_per_second": 15.104, + "num_input_tokens_seen": 61331831488, + "step": 117000 + }, + { + "epoch": 1.1166652754093278, + "grad_norm": 0.13141483068466187, + "learning_rate": 0.0009218630989585645, + "loss": 2.0933, + "num_input_tokens_seen": 61358045888, + "step": 117050 + }, + { + "epoch": 1.117142277925516, + "grad_norm": 0.14495305716991425, + "learning_rate": 0.0009203508214822651, + "loss": 2.0864, + "num_input_tokens_seen": 61384257568, + "step": 117100 + }, + { + "epoch": 1.1176192804417044, + "grad_norm": 0.14642465114593506, + "learning_rate": 0.0009188253147794443, + "loss": 2.0918, + "num_input_tokens_seen": 61410471968, + "step": 117150 + }, + { + "epoch": 1.1180962829578927, + "grad_norm": 0.13314634561538696, + "learning_rate": 0.0009172866268606513, + "loss": 2.0896, + "num_input_tokens_seen": 61436668768, + "step": 117200 + }, + { + "epoch": 1.1185732854740809, + "grad_norm": 0.15387175977230072, + "learning_rate": 0.0009157348061512727, + "loss": 2.0771, + "num_input_tokens_seen": 61462881056, + "step": 117250 + }, + { + "epoch": 1.119050287990269, + "grad_norm": 0.13886821269989014, + "learning_rate": 0.0009141699014900082, + "loss": 2.0945, + "num_input_tokens_seen": 61489085536, + "step": 117300 + }, + { + "epoch": 1.1195272905064575, + "grad_norm": 0.13939301669597626, + "learning_rate": 0.0009125919621273348, + "loss": 2.0918, + "num_input_tokens_seen": 61515286016, + "step": 117350 + }, + { + "epoch": 1.1200042930226457, + "grad_norm": 0.1996990144252777, + "learning_rate": 0.0009110010377239551, + "loss": 2.0859, + "num_input_tokens_seen": 61541500416, + "step": 117400 + }, + { + "epoch": 1.120481295538834, + "grad_norm": 0.135545015335083, + "learning_rate": 0.0009093971783492354, + "loss": 2.089, + "num_input_tokens_seen": 61567714816, + "step": 117450 + }, + { + "epoch": 1.1209582980550223, + "grad_norm": 0.1394105702638626, + "learning_rate": 0.0009077804344796301, + "loss": 2.0759, + "num_input_tokens_seen": 61593927520, + "step": 117500 + }, + { + "epoch": 1.1209582980550223, + "eval_loss": 2.003880739212036, + "eval_runtime": 83.0803, + "eval_samples_per_second": 60.183, + "eval_steps_per_second": 15.046, + "num_input_tokens_seen": 61593927520, + "step": 117500 + }, + { + "epoch": 1.1214353005712105, + "grad_norm": 0.1590648591518402, + "learning_rate": 0.0009061508569970925, + "loss": 2.0825, + "num_input_tokens_seen": 61620139072, + "step": 117550 + }, + { + "epoch": 1.1219123030873988, + "grad_norm": 0.13328000903129578, + "learning_rate": 0.0009045084971874737, + "loss": 2.0877, + "num_input_tokens_seen": 61646353472, + "step": 117600 + }, + { + "epoch": 1.122389305603587, + "grad_norm": 0.13834019005298615, + "learning_rate": 0.0009028534067389086, + "loss": 2.0871, + "num_input_tokens_seen": 61672566336, + "step": 117650 + }, + { + "epoch": 1.1228663081197754, + "grad_norm": 0.13156409561634064, + "learning_rate": 0.000901185637740189, + "loss": 2.0906, + "num_input_tokens_seen": 61698777696, + "step": 117700 + }, + { + "epoch": 1.1233433106359636, + "grad_norm": 0.1528773009777069, + "learning_rate": 0.0008995052426791246, + "loss": 2.0731, + "num_input_tokens_seen": 61724974336, + "step": 117750 + }, + { + "epoch": 1.1238203131521518, + "grad_norm": 0.14865480363368988, + "learning_rate": 0.0008978122744408905, + "loss": 2.082, + "num_input_tokens_seen": 61751177792, + "step": 117800 + }, + { + "epoch": 1.1242973156683402, + "grad_norm": 0.14318804442882538, + "learning_rate": 0.0008961067863063638, + "loss": 2.0891, + "num_input_tokens_seen": 61777391648, + "step": 117850 + }, + { + "epoch": 1.1247743181845284, + "grad_norm": 0.14581789076328278, + "learning_rate": 0.0008943888319504456, + "loss": 2.0908, + "num_input_tokens_seen": 61803602176, + "step": 117900 + }, + { + "epoch": 1.1252513207007167, + "grad_norm": 0.14142882823944092, + "learning_rate": 0.0008926584654403724, + "loss": 2.0791, + "num_input_tokens_seen": 61829816576, + "step": 117950 + }, + { + "epoch": 1.1257283232169049, + "grad_norm": 0.15033917129039764, + "learning_rate": 0.000890915741234015, + "loss": 2.0801, + "num_input_tokens_seen": 61856020192, + "step": 118000 + }, + { + "epoch": 1.1257283232169049, + "eval_loss": 2.0019845962524414, + "eval_runtime": 82.7188, + "eval_samples_per_second": 60.446, + "eval_steps_per_second": 15.111, + "num_input_tokens_seen": 61856020192, + "step": 118000 + }, + { + "epoch": 1.1262053257330933, + "grad_norm": 0.15097704529762268, + "learning_rate": 0.0008891607141781631, + "loss": 2.0857, + "num_input_tokens_seen": 61882234592, + "step": 118050 + }, + { + "epoch": 1.1266823282492815, + "grad_norm": 0.1383848935365677, + "learning_rate": 0.0008873934395068005, + "loss": 2.0858, + "num_input_tokens_seen": 61908441120, + "step": 118100 + }, + { + "epoch": 1.1271593307654697, + "grad_norm": 0.14688968658447266, + "learning_rate": 0.0008856139728393666, + "loss": 2.085, + "num_input_tokens_seen": 61934653984, + "step": 118150 + }, + { + "epoch": 1.1276363332816581, + "grad_norm": 0.14446312189102173, + "learning_rate": 0.0008838223701790055, + "loss": 2.0765, + "num_input_tokens_seen": 61960867808, + "step": 118200 + }, + { + "epoch": 1.1281133357978463, + "grad_norm": 0.1389646828174591, + "learning_rate": 0.0008820186879108038, + "loss": 2.0816, + "num_input_tokens_seen": 61987070336, + "step": 118250 + }, + { + "epoch": 1.1285903383140345, + "grad_norm": 0.14348453283309937, + "learning_rate": 0.0008802029828000156, + "loss": 2.0875, + "num_input_tokens_seen": 62013276640, + "step": 118300 + }, + { + "epoch": 1.129067340830223, + "grad_norm": 0.14246419072151184, + "learning_rate": 0.0008783753119902765, + "loss": 2.0828, + "num_input_tokens_seen": 62039490144, + "step": 118350 + }, + { + "epoch": 1.1295443433464112, + "grad_norm": 0.13848936557769775, + "learning_rate": 0.0008765357330018055, + "loss": 2.0895, + "num_input_tokens_seen": 62065704544, + "step": 118400 + }, + { + "epoch": 1.1300213458625994, + "grad_norm": 0.14894653856754303, + "learning_rate": 0.0008746843037295936, + "loss": 2.079, + "num_input_tokens_seen": 62091916704, + "step": 118450 + }, + { + "epoch": 1.1304983483787878, + "grad_norm": 0.1354195922613144, + "learning_rate": 0.0008728210824415827, + "loss": 2.0836, + "num_input_tokens_seen": 62118128864, + "step": 118500 + }, + { + "epoch": 1.1304983483787878, + "eval_loss": 2.004451274871826, + "eval_runtime": 82.4857, + "eval_samples_per_second": 60.617, + "eval_steps_per_second": 15.154, + "num_input_tokens_seen": 62118128864, + "step": 118500 + }, + { + "epoch": 1.130975350894976, + "grad_norm": 0.14576098322868347, + "learning_rate": 0.0008709461277768318, + "loss": 2.0912, + "num_input_tokens_seen": 62144343264, + "step": 118550 + }, + { + "epoch": 1.1314523534111642, + "grad_norm": 0.14351360499858856, + "learning_rate": 0.0008690594987436704, + "loss": 2.0777, + "num_input_tokens_seen": 62170554112, + "step": 118600 + }, + { + "epoch": 1.1319293559273524, + "grad_norm": 0.14756879210472107, + "learning_rate": 0.0008671612547178428, + "loss": 2.0907, + "num_input_tokens_seen": 62196764384, + "step": 118650 + }, + { + "epoch": 1.1324063584435409, + "grad_norm": 0.15026496350765228, + "learning_rate": 0.0008652514554406388, + "loss": 2.0857, + "num_input_tokens_seen": 62222966592, + "step": 118700 + }, + { + "epoch": 1.132883360959729, + "grad_norm": 0.13817134499549866, + "learning_rate": 0.0008633301610170136, + "loss": 2.0851, + "num_input_tokens_seen": 62249176192, + "step": 118750 + }, + { + "epoch": 1.1333603634759173, + "grad_norm": 0.13346219062805176, + "learning_rate": 0.0008613974319136957, + "loss": 2.0856, + "num_input_tokens_seen": 62275388064, + "step": 118800 + }, + { + "epoch": 1.1338373659921057, + "grad_norm": 0.14300605654716492, + "learning_rate": 0.0008594533289572853, + "loss": 2.0835, + "num_input_tokens_seen": 62301602464, + "step": 118850 + }, + { + "epoch": 1.134314368508294, + "grad_norm": 0.13790345191955566, + "learning_rate": 0.0008574979133323377, + "loss": 2.0811, + "num_input_tokens_seen": 62327812128, + "step": 118900 + }, + { + "epoch": 1.1347913710244821, + "grad_norm": 0.1419474184513092, + "learning_rate": 0.0008555312465794402, + "loss": 2.0783, + "num_input_tokens_seen": 62354024288, + "step": 118950 + }, + { + "epoch": 1.1352683735406703, + "grad_norm": 0.15154699981212616, + "learning_rate": 0.0008535533905932737, + "loss": 2.0858, + "num_input_tokens_seen": 62380238112, + "step": 119000 + }, + { + "epoch": 1.1352683735406703, + "eval_loss": 2.0006425380706787, + "eval_runtime": 82.1764, + "eval_samples_per_second": 60.845, + "eval_steps_per_second": 15.211, + "num_input_tokens_seen": 62380238112, + "step": 119000 + }, + { + "epoch": 1.1357453760568588, + "grad_norm": 0.1409357637166977, + "learning_rate": 0.0008515644076206653, + "loss": 2.0885, + "num_input_tokens_seen": 62406448192, + "step": 119050 + }, + { + "epoch": 1.136222378573047, + "grad_norm": 0.15409712493419647, + "learning_rate": 0.0008495643602586287, + "loss": 2.0778, + "num_input_tokens_seen": 62432661632, + "step": 119100 + }, + { + "epoch": 1.1366993810892352, + "grad_norm": 0.1327887326478958, + "learning_rate": 0.0008475533114523955, + "loss": 2.086, + "num_input_tokens_seen": 62458870752, + "step": 119150 + }, + { + "epoch": 1.1371763836054236, + "grad_norm": 0.14051629602909088, + "learning_rate": 0.0008455313244934324, + "loss": 2.0765, + "num_input_tokens_seen": 62485082688, + "step": 119200 + }, + { + "epoch": 1.1376533861216118, + "grad_norm": 0.13998936116695404, + "learning_rate": 0.0008434984630174508, + "loss": 2.0784, + "num_input_tokens_seen": 62511288832, + "step": 119250 + }, + { + "epoch": 1.1381303886378, + "grad_norm": 0.1316358745098114, + "learning_rate": 0.0008414547910024035, + "loss": 2.0839, + "num_input_tokens_seen": 62537499648, + "step": 119300 + }, + { + "epoch": 1.1386073911539882, + "grad_norm": 0.13315369188785553, + "learning_rate": 0.0008394003727664709, + "loss": 2.0793, + "num_input_tokens_seen": 62563710336, + "step": 119350 + }, + { + "epoch": 1.1390843936701767, + "grad_norm": 0.1454961597919464, + "learning_rate": 0.0008373352729660373, + "loss": 2.0814, + "num_input_tokens_seen": 62589918400, + "step": 119400 + }, + { + "epoch": 1.1395613961863649, + "grad_norm": 0.14860859513282776, + "learning_rate": 0.0008352595565936554, + "loss": 2.0885, + "num_input_tokens_seen": 62616130880, + "step": 119450 + }, + { + "epoch": 1.140038398702553, + "grad_norm": 0.13664905726909637, + "learning_rate": 0.000833173288976002, + "loss": 2.0836, + "num_input_tokens_seen": 62642339520, + "step": 119500 + }, + { + "epoch": 1.140038398702553, + "eval_loss": 1.9989631175994873, + "eval_runtime": 83.3074, + "eval_samples_per_second": 60.019, + "eval_steps_per_second": 15.005, + "num_input_tokens_seen": 62642339520, + "step": 119500 + }, + { + "epoch": 1.1405154012187415, + "grad_norm": 0.1337277889251709, + "learning_rate": 0.0008310765357718206, + "loss": 2.0745, + "num_input_tokens_seen": 62668548896, + "step": 119550 + }, + { + "epoch": 1.1409924037349297, + "grad_norm": 0.13231709599494934, + "learning_rate": 0.0008289693629698564, + "loss": 2.0851, + "num_input_tokens_seen": 62694761888, + "step": 119600 + }, + { + "epoch": 1.141469406251118, + "grad_norm": 0.13446244597434998, + "learning_rate": 0.0008268518368867782, + "loss": 2.0737, + "num_input_tokens_seen": 62720974368, + "step": 119650 + }, + { + "epoch": 1.1419464087673061, + "grad_norm": 0.14359907805919647, + "learning_rate": 0.0008247240241650918, + "loss": 2.0772, + "num_input_tokens_seen": 62747188768, + "step": 119700 + }, + { + "epoch": 1.1424234112834946, + "grad_norm": 0.13156485557556152, + "learning_rate": 0.0008225859917710439, + "loss": 2.0791, + "num_input_tokens_seen": 62773395936, + "step": 119750 + }, + { + "epoch": 1.1429004137996828, + "grad_norm": 0.14039525389671326, + "learning_rate": 0.000820437806992512, + "loss": 2.0656, + "num_input_tokens_seen": 62799610336, + "step": 119800 + }, + { + "epoch": 1.143377416315871, + "grad_norm": 0.14653949439525604, + "learning_rate": 0.0008182795374368893, + "loss": 2.0741, + "num_input_tokens_seen": 62825821984, + "step": 119850 + }, + { + "epoch": 1.1438544188320594, + "grad_norm": 0.12294785678386688, + "learning_rate": 0.0008161112510289549, + "loss": 2.0741, + "num_input_tokens_seen": 62852031840, + "step": 119900 + }, + { + "epoch": 1.1443314213482476, + "grad_norm": 0.18639816343784332, + "learning_rate": 0.0008139330160087374, + "loss": 2.1258, + "num_input_tokens_seen": 62878240576, + "step": 119950 + }, + { + "epoch": 1.1448084238644358, + "grad_norm": 0.1320071518421173, + "learning_rate": 0.0008117449009293668, + "loss": 2.0956, + "num_input_tokens_seen": 62904447680, + "step": 120000 + }, + { + "epoch": 1.1448084238644358, + "eval_loss": 2.0032639503479004, + "eval_runtime": 82.8531, + "eval_samples_per_second": 60.348, + "eval_steps_per_second": 15.087, + "num_input_tokens_seen": 62904447680, + "step": 120000 + }, + { + "epoch": 1.1452854263806242, + "grad_norm": 0.15100175142288208, + "learning_rate": 0.0008095469746549171, + "loss": 2.0793, + "num_input_tokens_seen": 62930656352, + "step": 120050 + }, + { + "epoch": 1.1457624288968125, + "grad_norm": 0.14095434546470642, + "learning_rate": 0.0008073393063582386, + "loss": 2.0828, + "num_input_tokens_seen": 62956868576, + "step": 120100 + }, + { + "epoch": 1.1462394314130007, + "grad_norm": 0.15013264119625092, + "learning_rate": 0.0008051219655187818, + "loss": 2.0711, + "num_input_tokens_seen": 62983080544, + "step": 120150 + }, + { + "epoch": 1.146716433929189, + "grad_norm": 0.1443673074245453, + "learning_rate": 0.00080289502192041, + "loss": 2.0764, + "num_input_tokens_seen": 63009276608, + "step": 120200 + }, + { + "epoch": 1.1471934364453773, + "grad_norm": 0.13627703487873077, + "learning_rate": 0.0008006585456492029, + "loss": 2.0805, + "num_input_tokens_seen": 63035488032, + "step": 120250 + }, + { + "epoch": 1.1476704389615655, + "grad_norm": 0.14744721353054047, + "learning_rate": 0.0007984126070912518, + "loss": 2.0691, + "num_input_tokens_seen": 63061701600, + "step": 120300 + }, + { + "epoch": 1.1481474414777537, + "grad_norm": 0.14301970601081848, + "learning_rate": 0.0007961572769304437, + "loss": 2.0788, + "num_input_tokens_seen": 63087914624, + "step": 120350 + }, + { + "epoch": 1.1486244439939421, + "grad_norm": 0.13261480629444122, + "learning_rate": 0.0007938926261462366, + "loss": 2.0802, + "num_input_tokens_seen": 63114128096, + "step": 120400 + }, + { + "epoch": 1.1491014465101304, + "grad_norm": 0.14857733249664307, + "learning_rate": 0.0007916187260114262, + "loss": 2.0773, + "num_input_tokens_seen": 63140341024, + "step": 120450 + }, + { + "epoch": 1.1495784490263186, + "grad_norm": 0.13263733685016632, + "learning_rate": 0.000789335648089903, + "loss": 2.0796, + "num_input_tokens_seen": 63166554368, + "step": 120500 + }, + { + "epoch": 1.1495784490263186, + "eval_loss": 1.9961134195327759, + "eval_runtime": 82.5305, + "eval_samples_per_second": 60.584, + "eval_steps_per_second": 15.146, + "num_input_tokens_seen": 63166554368, + "step": 120500 + }, + { + "epoch": 1.150055451542507, + "grad_norm": 0.13879702985286713, + "learning_rate": 0.0007870434642343984, + "loss": 2.0783, + "num_input_tokens_seen": 63192764288, + "step": 120550 + }, + { + "epoch": 1.1505324540586952, + "grad_norm": 0.13164860010147095, + "learning_rate": 0.000784742246584226, + "loss": 2.081, + "num_input_tokens_seen": 63218969504, + "step": 120600 + }, + { + "epoch": 1.1510094565748834, + "grad_norm": 0.1406654268503189, + "learning_rate": 0.0007824320675630089, + "loss": 2.0704, + "num_input_tokens_seen": 63245179680, + "step": 120650 + }, + { + "epoch": 1.1514864590910716, + "grad_norm": 0.13722951710224152, + "learning_rate": 0.0007801129998764014, + "loss": 2.0693, + "num_input_tokens_seen": 63271389024, + "step": 120700 + }, + { + "epoch": 1.15196346160726, + "grad_norm": 0.15168820321559906, + "learning_rate": 0.0007777851165098011, + "loss": 2.0813, + "num_input_tokens_seen": 63297594624, + "step": 120750 + }, + { + "epoch": 1.1524404641234482, + "grad_norm": 0.13907547295093536, + "learning_rate": 0.0007754484907260512, + "loss": 2.0747, + "num_input_tokens_seen": 63323809024, + "step": 120800 + }, + { + "epoch": 1.1529174666396365, + "grad_norm": 0.13827022910118103, + "learning_rate": 0.0007731031960631354, + "loss": 2.079, + "num_input_tokens_seen": 63350015808, + "step": 120850 + }, + { + "epoch": 1.1533944691558249, + "grad_norm": 0.1326221376657486, + "learning_rate": 0.0007707493063318629, + "loss": 2.0856, + "num_input_tokens_seen": 63376227968, + "step": 120900 + }, + { + "epoch": 1.153871471672013, + "grad_norm": 0.13669894635677338, + "learning_rate": 0.000768386895613546, + "loss": 2.0691, + "num_input_tokens_seen": 63402433504, + "step": 120950 + }, + { + "epoch": 1.1543484741882013, + "grad_norm": 0.1403321623802185, + "learning_rate": 0.0007660160382576683, + "loss": 2.077, + "num_input_tokens_seen": 63428647904, + "step": 121000 + }, + { + "epoch": 1.1543484741882013, + "eval_loss": 1.9939944744110107, + "eval_runtime": 82.7663, + "eval_samples_per_second": 60.411, + "eval_steps_per_second": 15.103, + "num_input_tokens_seen": 63428647904, + "step": 121000 + }, + { + "epoch": 1.1548254767043895, + "grad_norm": 0.1527141034603119, + "learning_rate": 0.000763636808879545, + "loss": 2.0812, + "num_input_tokens_seen": 63454858592, + "step": 121050 + }, + { + "epoch": 1.155302479220578, + "grad_norm": 0.14409616589546204, + "learning_rate": 0.0007612492823579744, + "loss": 2.0757, + "num_input_tokens_seen": 63481069536, + "step": 121100 + }, + { + "epoch": 1.1557794817367661, + "grad_norm": 0.1311630755662918, + "learning_rate": 0.0007588535338328816, + "loss": 2.0714, + "num_input_tokens_seen": 63507276640, + "step": 121150 + }, + { + "epoch": 1.1562564842529544, + "grad_norm": 0.12864112854003906, + "learning_rate": 0.0007564496387029531, + "loss": 2.0703, + "num_input_tokens_seen": 63533491040, + "step": 121200 + }, + { + "epoch": 1.1567334867691428, + "grad_norm": 0.1277550309896469, + "learning_rate": 0.0007540376726232647, + "loss": 2.0833, + "num_input_tokens_seen": 63559699712, + "step": 121250 + }, + { + "epoch": 1.157210489285331, + "grad_norm": 0.13141444325447083, + "learning_rate": 0.0007516177115029001, + "loss": 2.0755, + "num_input_tokens_seen": 63585905408, + "step": 121300 + }, + { + "epoch": 1.1576874918015192, + "grad_norm": 0.13436725735664368, + "learning_rate": 0.0007491898315025615, + "loss": 2.0716, + "num_input_tokens_seen": 63612116704, + "step": 121350 + }, + { + "epoch": 1.1581644943177074, + "grad_norm": 0.13668642938137054, + "learning_rate": 0.0007467541090321735, + "loss": 2.0766, + "num_input_tokens_seen": 63638330048, + "step": 121400 + }, + { + "epoch": 1.1586414968338958, + "grad_norm": 0.22589260339736938, + "learning_rate": 0.0007443106207484776, + "loss": 2.0793, + "num_input_tokens_seen": 63664542944, + "step": 121450 + }, + { + "epoch": 1.159118499350084, + "grad_norm": 0.14154261350631714, + "learning_rate": 0.00074185944355262, + "loss": 2.0938, + "num_input_tokens_seen": 63690757024, + "step": 121500 + }, + { + "epoch": 1.159118499350084, + "eval_loss": 1.9929685592651367, + "eval_runtime": 82.8366, + "eval_samples_per_second": 60.36, + "eval_steps_per_second": 15.09, + "num_input_tokens_seen": 63690757024, + "step": 121500 + }, + { + "epoch": 1.1595955018662722, + "grad_norm": 0.13303405046463013, + "learning_rate": 0.0007394006545877314, + "loss": 2.078, + "num_input_tokens_seen": 63716968288, + "step": 121550 + }, + { + "epoch": 1.1600725043824607, + "grad_norm": 0.12762907147407532, + "learning_rate": 0.0007369343312364993, + "loss": 2.0757, + "num_input_tokens_seen": 63743181728, + "step": 121600 + }, + { + "epoch": 1.1605495068986489, + "grad_norm": 0.160507932305336, + "learning_rate": 0.0007344605511187322, + "loss": 2.076, + "num_input_tokens_seen": 63769396128, + "step": 121650 + }, + { + "epoch": 1.161026509414837, + "grad_norm": 0.14160197973251343, + "learning_rate": 0.0007319793920889171, + "loss": 2.0762, + "num_input_tokens_seen": 63795607296, + "step": 121700 + }, + { + "epoch": 1.1615035119310255, + "grad_norm": 0.15858200192451477, + "learning_rate": 0.0007294909322337689, + "loss": 2.08, + "num_input_tokens_seen": 63821818336, + "step": 121750 + }, + { + "epoch": 1.1619805144472137, + "grad_norm": 0.13940422236919403, + "learning_rate": 0.0007269952498697733, + "loss": 2.0816, + "num_input_tokens_seen": 63848031552, + "step": 121800 + }, + { + "epoch": 1.162457516963402, + "grad_norm": 0.13600219786167145, + "learning_rate": 0.0007244924235407223, + "loss": 2.0757, + "num_input_tokens_seen": 63874245952, + "step": 121850 + }, + { + "epoch": 1.1629345194795904, + "grad_norm": 0.14759120345115662, + "learning_rate": 0.0007219825320152411, + "loss": 2.0883, + "num_input_tokens_seen": 63900453792, + "step": 121900 + }, + { + "epoch": 1.1634115219957786, + "grad_norm": 0.12860442698001862, + "learning_rate": 0.0007194656542843102, + "loss": 2.0802, + "num_input_tokens_seen": 63926661920, + "step": 121950 + }, + { + "epoch": 1.1638885245119668, + "grad_norm": 0.13766394555568695, + "learning_rate": 0.0007169418695587791, + "loss": 2.072, + "num_input_tokens_seen": 63952872768, + "step": 122000 + }, + { + "epoch": 1.1638885245119668, + "eval_loss": 1.991066813468933, + "eval_runtime": 82.2634, + "eval_samples_per_second": 60.78, + "eval_steps_per_second": 15.195, + "num_input_tokens_seen": 63952872768, + "step": 122000 + }, + { + "epoch": 1.164365527028155, + "grad_norm": 0.13863904774188995, + "learning_rate": 0.0007144112572668733, + "loss": 2.0703, + "num_input_tokens_seen": 63979084224, + "step": 122050 + }, + { + "epoch": 1.1648425295443434, + "grad_norm": 0.1426379680633545, + "learning_rate": 0.0007118738970516943, + "loss": 2.0766, + "num_input_tokens_seen": 64005286944, + "step": 122100 + }, + { + "epoch": 1.1653195320605316, + "grad_norm": 0.13977181911468506, + "learning_rate": 0.0007093298687687141, + "loss": 2.0692, + "num_input_tokens_seen": 64031487744, + "step": 122150 + }, + { + "epoch": 1.1657965345767198, + "grad_norm": 0.1425238400697708, + "learning_rate": 0.0007067792524832604, + "loss": 2.0662, + "num_input_tokens_seen": 64057695552, + "step": 122200 + }, + { + "epoch": 1.1662735370929083, + "grad_norm": 0.15061677992343903, + "learning_rate": 0.0007042221284679982, + "loss": 2.0781, + "num_input_tokens_seen": 64083893664, + "step": 122250 + }, + { + "epoch": 1.1667505396090965, + "grad_norm": 0.12374892085790634, + "learning_rate": 0.0007016585772004026, + "loss": 2.0745, + "num_input_tokens_seen": 64110107392, + "step": 122300 + }, + { + "epoch": 1.1672275421252847, + "grad_norm": 0.1427278071641922, + "learning_rate": 0.0006990886793602267, + "loss": 2.0861, + "num_input_tokens_seen": 64136321792, + "step": 122350 + }, + { + "epoch": 1.1677045446414729, + "grad_norm": 0.15141050517559052, + "learning_rate": 0.0006965125158269618, + "loss": 2.0767, + "num_input_tokens_seen": 64162534656, + "step": 122400 + }, + { + "epoch": 1.1681815471576613, + "grad_norm": 0.13262976706027985, + "learning_rate": 0.0006939301676772927, + "loss": 2.0662, + "num_input_tokens_seen": 64188740064, + "step": 122450 + }, + { + "epoch": 1.1686585496738495, + "grad_norm": 0.13390204310417175, + "learning_rate": 0.000691341716182545, + "loss": 2.0684, + "num_input_tokens_seen": 64214942816, + "step": 122500 + }, + { + "epoch": 1.1686585496738495, + "eval_loss": 1.9892343282699585, + "eval_runtime": 81.7351, + "eval_samples_per_second": 61.173, + "eval_steps_per_second": 15.293, + "num_input_tokens_seen": 64214942816, + "step": 122500 + }, + { + "epoch": 1.1691355521900377, + "grad_norm": 0.14351387321949005, + "learning_rate": 0.0006887472428061285, + "loss": 2.0611, + "num_input_tokens_seen": 64241151872, + "step": 122550 + }, + { + "epoch": 1.1696125547062262, + "grad_norm": 0.1321556568145752, + "learning_rate": 0.0006861468292009726, + "loss": 2.0726, + "num_input_tokens_seen": 64267354176, + "step": 122600 + }, + { + "epoch": 1.1700895572224144, + "grad_norm": 0.12825502455234528, + "learning_rate": 0.0006835405572069572, + "loss": 2.0703, + "num_input_tokens_seen": 64293568544, + "step": 122650 + }, + { + "epoch": 1.1705665597386026, + "grad_norm": 0.1376345157623291, + "learning_rate": 0.0006809285088483361, + "loss": 2.0789, + "num_input_tokens_seen": 64319782944, + "step": 122700 + }, + { + "epoch": 1.1710435622547908, + "grad_norm": 0.14178837835788727, + "learning_rate": 0.0006783107663311565, + "loss": 2.0755, + "num_input_tokens_seen": 64345996064, + "step": 122750 + }, + { + "epoch": 1.1715205647709792, + "grad_norm": 0.1475340873003006, + "learning_rate": 0.0006756874120406714, + "loss": 2.0668, + "num_input_tokens_seen": 64372202944, + "step": 122800 + }, + { + "epoch": 1.1719975672871674, + "grad_norm": 0.13012921810150146, + "learning_rate": 0.0006730585285387465, + "loss": 2.0618, + "num_input_tokens_seen": 64398414944, + "step": 122850 + }, + { + "epoch": 1.1724745698033556, + "grad_norm": 0.13203522562980652, + "learning_rate": 0.0006704241985612625, + "loss": 2.0712, + "num_input_tokens_seen": 64424627264, + "step": 122900 + }, + { + "epoch": 1.172951572319544, + "grad_norm": 0.13648848235607147, + "learning_rate": 0.0006677845050155106, + "loss": 2.0694, + "num_input_tokens_seen": 64450839392, + "step": 122950 + }, + { + "epoch": 1.1734285748357323, + "grad_norm": 0.1383182257413864, + "learning_rate": 0.0006651395309775837, + "loss": 2.0564, + "num_input_tokens_seen": 64477051392, + "step": 123000 + }, + { + "epoch": 1.1734285748357323, + "eval_loss": 1.9881237745285034, + "eval_runtime": 82.9953, + "eval_samples_per_second": 60.244, + "eval_steps_per_second": 15.061, + "num_input_tokens_seen": 64477051392, + "step": 123000 + }, + { + "epoch": 1.1739055773519205, + "grad_norm": 0.14069771766662598, + "learning_rate": 0.0006624893596897613, + "loss": 2.0767, + "num_input_tokens_seen": 64503259872, + "step": 123050 + }, + { + "epoch": 1.174382579868109, + "grad_norm": 0.14180107414722443, + "learning_rate": 0.0006598340745578908, + "loss": 2.0611, + "num_input_tokens_seen": 64529460896, + "step": 123100 + }, + { + "epoch": 1.174859582384297, + "grad_norm": 0.14584094285964966, + "learning_rate": 0.000657173759148761, + "loss": 2.0693, + "num_input_tokens_seen": 64555675296, + "step": 123150 + }, + { + "epoch": 1.1753365849004853, + "grad_norm": 0.1269799768924713, + "learning_rate": 0.0006545084971874737, + "loss": 2.0615, + "num_input_tokens_seen": 64581882720, + "step": 123200 + }, + { + "epoch": 1.1758135874166737, + "grad_norm": 0.15073458850383759, + "learning_rate": 0.0006518383725548074, + "loss": 2.083, + "num_input_tokens_seen": 64608088736, + "step": 123250 + }, + { + "epoch": 1.176290589932862, + "grad_norm": 0.12902715802192688, + "learning_rate": 0.000649163469284578, + "loss": 2.0579, + "num_input_tokens_seen": 64634299936, + "step": 123300 + }, + { + "epoch": 1.1767675924490502, + "grad_norm": 0.13666096329689026, + "learning_rate": 0.0006464838715609945, + "loss": 2.0673, + "num_input_tokens_seen": 64660511904, + "step": 123350 + }, + { + "epoch": 1.1772445949652384, + "grad_norm": 0.13477379083633423, + "learning_rate": 0.0006437996637160086, + "loss": 2.0752, + "num_input_tokens_seen": 64686718272, + "step": 123400 + }, + { + "epoch": 1.1777215974814268, + "grad_norm": 0.13596594333648682, + "learning_rate": 0.0006411109302266615, + "loss": 2.0606, + "num_input_tokens_seen": 64712932256, + "step": 123450 + }, + { + "epoch": 1.178198599997615, + "grad_norm": 0.1400011032819748, + "learning_rate": 0.0006384177557124247, + "loss": 2.066, + "num_input_tokens_seen": 64739145440, + "step": 123500 + }, + { + "epoch": 1.178198599997615, + "eval_loss": 1.986546516418457, + "eval_runtime": 82.7963, + "eval_samples_per_second": 60.389, + "eval_steps_per_second": 15.097, + "num_input_tokens_seen": 64739145440, + "step": 123500 + }, + { + "epoch": 1.1786756025138032, + "grad_norm": 0.13023069500923157, + "learning_rate": 0.0006357202249325371, + "loss": 2.0727, + "num_input_tokens_seen": 64765359840, + "step": 123550 + }, + { + "epoch": 1.1791526050299916, + "grad_norm": 0.13744056224822998, + "learning_rate": 0.0006330184227833376, + "loss": 2.0603, + "num_input_tokens_seen": 64791573504, + "step": 123600 + }, + { + "epoch": 1.1796296075461798, + "grad_norm": 0.1399419903755188, + "learning_rate": 0.0006303124342955927, + "loss": 2.0699, + "num_input_tokens_seen": 64817787904, + "step": 123650 + }, + { + "epoch": 1.180106610062368, + "grad_norm": 0.13453304767608643, + "learning_rate": 0.0006276023446318213, + "loss": 2.0764, + "num_input_tokens_seen": 64844002304, + "step": 123700 + }, + { + "epoch": 1.1805836125785563, + "grad_norm": 0.13495005667209625, + "learning_rate": 0.0006248882390836135, + "loss": 2.0629, + "num_input_tokens_seen": 64870216704, + "step": 123750 + }, + { + "epoch": 1.1810606150947447, + "grad_norm": 0.14330346882343292, + "learning_rate": 0.000622170203068947, + "loss": 2.0677, + "num_input_tokens_seen": 64896426784, + "step": 123800 + }, + { + "epoch": 1.181537617610933, + "grad_norm": 0.13179130852222443, + "learning_rate": 0.0006194483221294988, + "loss": 2.0568, + "num_input_tokens_seen": 64922636000, + "step": 123850 + }, + { + "epoch": 1.182014620127121, + "grad_norm": 0.12518762052059174, + "learning_rate": 0.0006167226819279528, + "loss": 2.0604, + "num_input_tokens_seen": 64948840416, + "step": 123900 + }, + { + "epoch": 1.1824916226433095, + "grad_norm": 0.12823528051376343, + "learning_rate": 0.0006139933682453035, + "loss": 2.0683, + "num_input_tokens_seen": 64975054816, + "step": 123950 + }, + { + "epoch": 1.1829686251594977, + "grad_norm": 0.1308305859565735, + "learning_rate": 0.0006112604669781572, + "loss": 2.0639, + "num_input_tokens_seen": 65001257824, + "step": 124000 + }, + { + "epoch": 1.1829686251594977, + "eval_loss": 1.9843353033065796, + "eval_runtime": 82.7751, + "eval_samples_per_second": 60.405, + "eval_steps_per_second": 15.101, + "num_input_tokens_seen": 65001257824, + "step": 124000 + }, + { + "epoch": 1.183445627675686, + "grad_norm": 0.12966303527355194, + "learning_rate": 0.0006085240641360281, + "loss": 2.0655, + "num_input_tokens_seen": 65027466432, + "step": 124050 + }, + { + "epoch": 1.1839226301918742, + "grad_norm": 0.13216206431388855, + "learning_rate": 0.0006057842458386314, + "loss": 2.0787, + "num_input_tokens_seen": 65053680192, + "step": 124100 + }, + { + "epoch": 1.1843996327080626, + "grad_norm": 0.13295891880989075, + "learning_rate": 0.0006030410983131733, + "loss": 2.0654, + "num_input_tokens_seen": 65079892928, + "step": 124150 + }, + { + "epoch": 1.1848766352242508, + "grad_norm": 0.14478819072246552, + "learning_rate": 0.0006002947078916364, + "loss": 2.0638, + "num_input_tokens_seen": 65106107328, + "step": 124200 + }, + { + "epoch": 1.185353637740439, + "grad_norm": 0.13410045206546783, + "learning_rate": 0.0005975451610080642, + "loss": 2.0711, + "num_input_tokens_seen": 65132321728, + "step": 124250 + }, + { + "epoch": 1.1858306402566274, + "grad_norm": 0.14699777960777283, + "learning_rate": 0.0005947925441958392, + "loss": 2.0574, + "num_input_tokens_seen": 65158534656, + "step": 124300 + }, + { + "epoch": 1.1863076427728156, + "grad_norm": 0.13368327915668488, + "learning_rate": 0.0005920369440849609, + "loss": 2.0626, + "num_input_tokens_seen": 65184748736, + "step": 124350 + }, + { + "epoch": 1.1867846452890038, + "grad_norm": 0.13047395646572113, + "learning_rate": 0.0005892784473993184, + "loss": 2.06, + "num_input_tokens_seen": 65210950912, + "step": 124400 + }, + { + "epoch": 1.187261647805192, + "grad_norm": 0.13072432577610016, + "learning_rate": 0.0005865171409539613, + "loss": 2.0869, + "num_input_tokens_seen": 65237165312, + "step": 124450 + }, + { + "epoch": 1.1877386503213805, + "grad_norm": 0.14443765580654144, + "learning_rate": 0.0005837531116523682, + "loss": 2.0675, + "num_input_tokens_seen": 65263378112, + "step": 124500 + }, + { + "epoch": 1.1877386503213805, + "eval_loss": 1.9832085371017456, + "eval_runtime": 83.5278, + "eval_samples_per_second": 59.86, + "eval_steps_per_second": 14.965, + "num_input_tokens_seen": 65263378112, + "step": 124500 + }, + { + "epoch": 1.1882156528375687, + "grad_norm": 0.13271184265613556, + "learning_rate": 0.0005809864464837105, + "loss": 2.0507, + "num_input_tokens_seen": 65289588448, + "step": 124550 + }, + { + "epoch": 1.188692655353757, + "grad_norm": 0.13720299303531647, + "learning_rate": 0.0005782172325201155, + "loss": 2.0728, + "num_input_tokens_seen": 65315802432, + "step": 124600 + }, + { + "epoch": 1.1891696578699453, + "grad_norm": 0.12747812271118164, + "learning_rate": 0.0005754455569139257, + "loss": 2.0786, + "num_input_tokens_seen": 65342011648, + "step": 124650 + }, + { + "epoch": 1.1896466603861335, + "grad_norm": 0.13649390637874603, + "learning_rate": 0.0005726715068949564, + "loss": 2.0578, + "num_input_tokens_seen": 65368225184, + "step": 124700 + }, + { + "epoch": 1.1901236629023217, + "grad_norm": 0.13283640146255493, + "learning_rate": 0.0005698951697677498, + "loss": 2.0616, + "num_input_tokens_seen": 65394434464, + "step": 124750 + }, + { + "epoch": 1.1906006654185102, + "grad_norm": 0.13304251432418823, + "learning_rate": 0.0005671166329088278, + "loss": 2.0657, + "num_input_tokens_seen": 65420648864, + "step": 124800 + }, + { + "epoch": 1.1910776679346984, + "grad_norm": 0.1442023664712906, + "learning_rate": 0.000564335983763942, + "loss": 2.0584, + "num_input_tokens_seen": 65446854944, + "step": 124850 + }, + { + "epoch": 1.1915546704508866, + "grad_norm": 0.13637055456638336, + "learning_rate": 0.0005615533098453215, + "loss": 2.0719, + "num_input_tokens_seen": 65473067296, + "step": 124900 + }, + { + "epoch": 1.192031672967075, + "grad_norm": 0.13165481388568878, + "learning_rate": 0.0005587686987289189, + "loss": 2.0594, + "num_input_tokens_seen": 65499281184, + "step": 124950 + }, + { + "epoch": 1.1925086754832632, + "grad_norm": 0.14200669527053833, + "learning_rate": 0.0005559822380516539, + "loss": 2.0692, + "num_input_tokens_seen": 65525493280, + "step": 125000 + }, + { + "epoch": 1.1925086754832632, + "eval_loss": 1.982203722000122, + "eval_runtime": 82.3332, + "eval_samples_per_second": 60.729, + "eval_steps_per_second": 15.182, + "num_input_tokens_seen": 65525493280, + "step": 125000 + }, + { + "epoch": 1.1929856779994514, + "grad_norm": 0.13272584974765778, + "learning_rate": 0.0005531940155086557, + "loss": 2.0602, + "num_input_tokens_seen": 65551700064, + "step": 125050 + }, + { + "epoch": 1.1934626805156396, + "grad_norm": 0.14066773653030396, + "learning_rate": 0.0005504041188505022, + "loss": 2.0695, + "num_input_tokens_seen": 65577910784, + "step": 125100 + }, + { + "epoch": 1.193939683031828, + "grad_norm": 0.13133113086223602, + "learning_rate": 0.0005476126358804593, + "loss": 2.0686, + "num_input_tokens_seen": 65604124224, + "step": 125150 + }, + { + "epoch": 1.1944166855480163, + "grad_norm": 0.13990654051303864, + "learning_rate": 0.0005448196544517168, + "loss": 2.0532, + "num_input_tokens_seen": 65630324960, + "step": 125200 + }, + { + "epoch": 1.1948936880642045, + "grad_norm": 0.14154765009880066, + "learning_rate": 0.0005420252624646238, + "loss": 2.0518, + "num_input_tokens_seen": 65656532992, + "step": 125250 + }, + { + "epoch": 1.195370690580393, + "grad_norm": 0.13149969279766083, + "learning_rate": 0.0005392295478639225, + "loss": 2.0619, + "num_input_tokens_seen": 65682736768, + "step": 125300 + }, + { + "epoch": 1.1958476930965811, + "grad_norm": 0.1339765191078186, + "learning_rate": 0.0005364325986359802, + "loss": 2.0706, + "num_input_tokens_seen": 65708951168, + "step": 125350 + }, + { + "epoch": 1.1963246956127693, + "grad_norm": 0.13910150527954102, + "learning_rate": 0.0005336345028060199, + "loss": 2.0596, + "num_input_tokens_seen": 65735165568, + "step": 125400 + }, + { + "epoch": 1.1968016981289575, + "grad_norm": 0.1447630077600479, + "learning_rate": 0.0005308353484353508, + "loss": 2.0518, + "num_input_tokens_seen": 65761369888, + "step": 125450 + }, + { + "epoch": 1.197278700645146, + "grad_norm": 0.13201679289340973, + "learning_rate": 0.0005280352236185959, + "loss": 2.0645, + "num_input_tokens_seen": 65787582144, + "step": 125500 + }, + { + "epoch": 1.197278700645146, + "eval_loss": 1.9799100160598755, + "eval_runtime": 83.01, + "eval_samples_per_second": 60.234, + "eval_steps_per_second": 15.058, + "num_input_tokens_seen": 65787582144, + "step": 125500 + }, + { + "epoch": 1.1977557031613342, + "grad_norm": 0.1335040032863617, + "learning_rate": 0.0005252342164809204, + "loss": 2.0597, + "num_input_tokens_seen": 65813796352, + "step": 125550 + }, + { + "epoch": 1.1982327056775224, + "grad_norm": 0.13693130016326904, + "learning_rate": 0.0005224324151752575, + "loss": 2.0594, + "num_input_tokens_seen": 65840010208, + "step": 125600 + }, + { + "epoch": 1.1987097081937108, + "grad_norm": 0.13866880536079407, + "learning_rate": 0.0005196299078795343, + "loss": 2.0511, + "num_input_tokens_seen": 65866216672, + "step": 125650 + }, + { + "epoch": 1.199186710709899, + "grad_norm": 0.12740108370780945, + "learning_rate": 0.000516826782793897, + "loss": 2.0607, + "num_input_tokens_seen": 65892430944, + "step": 125700 + }, + { + "epoch": 1.1996637132260872, + "grad_norm": 0.13575108349323273, + "learning_rate": 0.0005140231281379345, + "loss": 2.0555, + "num_input_tokens_seen": 65918642496, + "step": 125750 + }, + { + "epoch": 1.2001407157422754, + "grad_norm": 0.13791455328464508, + "learning_rate": 0.0005112190321479025, + "loss": 2.0632, + "num_input_tokens_seen": 65944852960, + "step": 125800 + }, + { + "epoch": 1.2006177182584639, + "grad_norm": 0.1315431296825409, + "learning_rate": 0.0005084145830739461, + "loss": 2.0646, + "num_input_tokens_seen": 65971066432, + "step": 125850 + }, + { + "epoch": 1.201094720774652, + "grad_norm": 0.12288303673267365, + "learning_rate": 0.000505609869177323, + "loss": 2.0748, + "num_input_tokens_seen": 65997277888, + "step": 125900 + }, + { + "epoch": 1.2015717232908403, + "grad_norm": 0.12677106261253357, + "learning_rate": 0.0005028049787276249, + "loss": 2.0595, + "num_input_tokens_seen": 66023480960, + "step": 125950 + }, + { + "epoch": 1.2020487258070287, + "grad_norm": 0.140994593501091, + "learning_rate": 0.0005, + "loss": 2.0556, + "num_input_tokens_seen": 66049692768, + "step": 126000 + }, + { + "epoch": 1.2020487258070287, + "eval_loss": 1.978381633758545, + "eval_runtime": 81.8164, + "eval_samples_per_second": 61.112, + "eval_steps_per_second": 15.278, + "num_input_tokens_seen": 66049692768, + "step": 126000 + }, + { + "epoch": 1.202525728323217, + "grad_norm": 0.1393454372882843, + "learning_rate": 0.0004971950212723752, + "loss": 2.0569, + "num_input_tokens_seen": 66075907072, + "step": 126050 + }, + { + "epoch": 1.2030027308394051, + "grad_norm": 0.1390795111656189, + "learning_rate": 0.0004943901308226771, + "loss": 2.0579, + "num_input_tokens_seen": 66102120320, + "step": 126100 + }, + { + "epoch": 1.2034797333555933, + "grad_norm": 0.136804461479187, + "learning_rate": 0.0004915854169260539, + "loss": 2.0594, + "num_input_tokens_seen": 66128330880, + "step": 126150 + }, + { + "epoch": 1.2039567358717818, + "grad_norm": 0.14418946206569672, + "learning_rate": 0.0004887809678520976, + "loss": 2.0521, + "num_input_tokens_seen": 66154537216, + "step": 126200 + }, + { + "epoch": 1.20443373838797, + "grad_norm": 0.1406649798154831, + "learning_rate": 0.00048597687186206556, + "loss": 2.0604, + "num_input_tokens_seen": 66180744192, + "step": 126250 + }, + { + "epoch": 1.2049107409041582, + "grad_norm": 0.13004782795906067, + "learning_rate": 0.0004831732172061032, + "loss": 2.0633, + "num_input_tokens_seen": 66206951232, + "step": 126300 + }, + { + "epoch": 1.2053877434203466, + "grad_norm": 0.1319655478000641, + "learning_rate": 0.00048037009212046586, + "loss": 2.0609, + "num_input_tokens_seen": 66233151744, + "step": 126350 + }, + { + "epoch": 1.2058647459365348, + "grad_norm": 0.13051386177539825, + "learning_rate": 0.0004775675848247427, + "loss": 2.0591, + "num_input_tokens_seen": 66259358592, + "step": 126400 + }, + { + "epoch": 1.206341748452723, + "grad_norm": 0.12983474135398865, + "learning_rate": 0.0004747657835190795, + "loss": 2.0571, + "num_input_tokens_seen": 66285559520, + "step": 126450 + }, + { + "epoch": 1.2068187509689114, + "grad_norm": 0.12744031846523285, + "learning_rate": 0.00047196477638140405, + "loss": 2.0581, + "num_input_tokens_seen": 66311770112, + "step": 126500 + }, + { + "epoch": 1.2068187509689114, + "eval_loss": 1.9767038822174072, + "eval_runtime": 82.0094, + "eval_samples_per_second": 60.969, + "eval_steps_per_second": 15.242, + "num_input_tokens_seen": 66311770112, + "step": 126500 + }, + { + "epoch": 1.2072957534850997, + "grad_norm": 0.13606679439544678, + "learning_rate": 0.00046916465156464924, + "loss": 2.062, + "num_input_tokens_seen": 66337979200, + "step": 126550 + }, + { + "epoch": 1.2077727560012879, + "grad_norm": 0.12876896560192108, + "learning_rate": 0.0004663654971939802, + "loss": 2.0627, + "num_input_tokens_seen": 66364192640, + "step": 126600 + }, + { + "epoch": 1.2082497585174763, + "grad_norm": 0.18826884031295776, + "learning_rate": 0.00046356740136402, + "loss": 2.0573, + "num_input_tokens_seen": 66390404768, + "step": 126650 + }, + { + "epoch": 1.2087267610336645, + "grad_norm": 0.1488431692123413, + "learning_rate": 0.0004607704521360776, + "loss": 2.0592, + "num_input_tokens_seen": 66416613920, + "step": 126700 + }, + { + "epoch": 1.2092037635498527, + "grad_norm": 0.12901978194713593, + "learning_rate": 0.0004579747375353763, + "loss": 2.0601, + "num_input_tokens_seen": 66442820800, + "step": 126750 + }, + { + "epoch": 1.209680766066041, + "grad_norm": 0.13032038509845734, + "learning_rate": 0.0004551803455482833, + "loss": 2.0675, + "num_input_tokens_seen": 66469028480, + "step": 126800 + }, + { + "epoch": 1.2101577685822293, + "grad_norm": 0.13756315410137177, + "learning_rate": 0.00045238736411954073, + "loss": 2.0543, + "num_input_tokens_seen": 66495230816, + "step": 126850 + }, + { + "epoch": 1.2106347710984176, + "grad_norm": 0.13066066801548004, + "learning_rate": 0.0004495958811494978, + "loss": 2.0545, + "num_input_tokens_seen": 66521443360, + "step": 126900 + }, + { + "epoch": 1.2111117736146058, + "grad_norm": 0.13837099075317383, + "learning_rate": 0.00044680598449134434, + "loss": 2.0557, + "num_input_tokens_seen": 66547651488, + "step": 126950 + }, + { + "epoch": 1.2115887761307942, + "grad_norm": 0.13125094771385193, + "learning_rate": 0.0004440177619483461, + "loss": 2.0633, + "num_input_tokens_seen": 66573856704, + "step": 127000 + }, + { + "epoch": 1.2115887761307942, + "eval_loss": 1.9741461277008057, + "eval_runtime": 82.3333, + "eval_samples_per_second": 60.729, + "eval_steps_per_second": 15.182, + "num_input_tokens_seen": 66573856704, + "step": 127000 + }, + { + "epoch": 1.2120657786469824, + "grad_norm": 0.13154049217700958, + "learning_rate": 0.00044123130127108126, + "loss": 2.0525, + "num_input_tokens_seen": 66600067712, + "step": 127050 + }, + { + "epoch": 1.2125427811631706, + "grad_norm": 0.13129626214504242, + "learning_rate": 0.00043844669015467863, + "loss": 2.0411, + "num_input_tokens_seen": 66626274400, + "step": 127100 + }, + { + "epoch": 1.2130197836793588, + "grad_norm": 0.12721647322177887, + "learning_rate": 0.0004356640162360581, + "loss": 2.0469, + "num_input_tokens_seen": 66652487040, + "step": 127150 + }, + { + "epoch": 1.2134967861955472, + "grad_norm": 0.1383296549320221, + "learning_rate": 0.0004328833670911724, + "loss": 2.0578, + "num_input_tokens_seen": 66678700288, + "step": 127200 + }, + { + "epoch": 1.2139737887117354, + "grad_norm": 0.12966816127300262, + "learning_rate": 0.00043010483023225046, + "loss": 2.0544, + "num_input_tokens_seen": 66704910336, + "step": 127250 + }, + { + "epoch": 1.2144507912279237, + "grad_norm": 0.13144998252391815, + "learning_rate": 0.0004273284931050438, + "loss": 2.061, + "num_input_tokens_seen": 66731122112, + "step": 127300 + }, + { + "epoch": 1.214927793744112, + "grad_norm": 0.13422222435474396, + "learning_rate": 0.0004245544430860743, + "loss": 2.062, + "num_input_tokens_seen": 66757331872, + "step": 127350 + }, + { + "epoch": 1.2154047962603003, + "grad_norm": 0.1333204060792923, + "learning_rate": 0.0004217827674798845, + "loss": 2.0538, + "num_input_tokens_seen": 66783545248, + "step": 127400 + }, + { + "epoch": 1.2158817987764885, + "grad_norm": 0.13239559531211853, + "learning_rate": 0.0004190135535162894, + "loss": 2.0545, + "num_input_tokens_seen": 66809758656, + "step": 127450 + }, + { + "epoch": 1.2163588012926767, + "grad_norm": 0.13535359501838684, + "learning_rate": 0.00041624688834763184, + "loss": 2.0625, + "num_input_tokens_seen": 66835970592, + "step": 127500 + }, + { + "epoch": 1.2163588012926767, + "eval_loss": 1.9728902578353882, + "eval_runtime": 82.272, + "eval_samples_per_second": 60.774, + "eval_steps_per_second": 15.194, + "num_input_tokens_seen": 66835970592, + "step": 127500 + }, + { + "epoch": 1.2168358038088651, + "grad_norm": 0.1306886225938797, + "learning_rate": 0.0004134828590460387, + "loss": 2.0548, + "num_input_tokens_seen": 66862174016, + "step": 127550 + }, + { + "epoch": 1.2173128063250533, + "grad_norm": 0.1322244554758072, + "learning_rate": 0.0004107215526006817, + "loss": 2.0544, + "num_input_tokens_seen": 66888384224, + "step": 127600 + }, + { + "epoch": 1.2177898088412416, + "grad_norm": 0.13241881132125854, + "learning_rate": 0.0004079630559150391, + "loss": 2.0646, + "num_input_tokens_seen": 66914597888, + "step": 127650 + }, + { + "epoch": 1.21826681135743, + "grad_norm": 0.12745130062103271, + "learning_rate": 0.0004052074558041608, + "loss": 2.0554, + "num_input_tokens_seen": 66940807552, + "step": 127700 + }, + { + "epoch": 1.2187438138736182, + "grad_norm": 0.13167862594127655, + "learning_rate": 0.00040245483899193594, + "loss": 2.0449, + "num_input_tokens_seen": 66967017376, + "step": 127750 + }, + { + "epoch": 1.2192208163898064, + "grad_norm": 0.1641312688589096, + "learning_rate": 0.00039970529210836363, + "loss": 2.0438, + "num_input_tokens_seen": 66993229600, + "step": 127800 + }, + { + "epoch": 1.2196978189059948, + "grad_norm": 0.1290162205696106, + "learning_rate": 0.00039695890168682686, + "loss": 2.0633, + "num_input_tokens_seen": 67019433984, + "step": 127850 + }, + { + "epoch": 1.220174821422183, + "grad_norm": 0.12822365760803223, + "learning_rate": 0.0003942157541613686, + "loss": 2.0477, + "num_input_tokens_seen": 67045643168, + "step": 127900 + }, + { + "epoch": 1.2206518239383712, + "grad_norm": 0.13961108028888702, + "learning_rate": 0.0003914759358639719, + "loss": 2.063, + "num_input_tokens_seen": 67071854592, + "step": 127950 + }, + { + "epoch": 1.2211288264545597, + "grad_norm": 0.13082347810268402, + "learning_rate": 0.00038873953302184284, + "loss": 2.0557, + "num_input_tokens_seen": 67098059328, + "step": 128000 + }, + { + "epoch": 1.2211288264545597, + "eval_loss": 1.9715449810028076, + "eval_runtime": 83.7065, + "eval_samples_per_second": 59.733, + "eval_steps_per_second": 14.933, + "num_input_tokens_seen": 67098059328, + "step": 128000 + }, + { + "epoch": 1.2216058289707479, + "grad_norm": 0.13075117766857147, + "learning_rate": 0.00038600663175469667, + "loss": 2.0582, + "num_input_tokens_seen": 67124264448, + "step": 128050 + }, + { + "epoch": 1.222082831486936, + "grad_norm": 0.1297282576560974, + "learning_rate": 0.00038327731807204744, + "loss": 2.0595, + "num_input_tokens_seen": 67150472320, + "step": 128100 + }, + { + "epoch": 1.2225598340031243, + "grad_norm": 0.12640318274497986, + "learning_rate": 0.00038055167787050134, + "loss": 2.0525, + "num_input_tokens_seen": 67176672192, + "step": 128150 + }, + { + "epoch": 1.2230368365193127, + "grad_norm": 0.1315733790397644, + "learning_rate": 0.00037782979693105293, + "loss": 2.0499, + "num_input_tokens_seen": 67202877408, + "step": 128200 + }, + { + "epoch": 1.223513839035501, + "grad_norm": 0.12865200638771057, + "learning_rate": 0.0003751117609163865, + "loss": 2.051, + "num_input_tokens_seen": 67229091168, + "step": 128250 + }, + { + "epoch": 1.2239908415516891, + "grad_norm": 0.1271800547838211, + "learning_rate": 0.00037239765536817873, + "loss": 2.0555, + "num_input_tokens_seen": 67255304768, + "step": 128300 + }, + { + "epoch": 1.2244678440678776, + "grad_norm": 0.13572408258914948, + "learning_rate": 0.0003696875657044073, + "loss": 2.0622, + "num_input_tokens_seen": 67281509184, + "step": 128350 + }, + { + "epoch": 1.2249448465840658, + "grad_norm": 0.12558363378047943, + "learning_rate": 0.0003669815772166625, + "loss": 2.0548, + "num_input_tokens_seen": 67307717088, + "step": 128400 + }, + { + "epoch": 1.225421849100254, + "grad_norm": 0.13062912225723267, + "learning_rate": 0.0003642797750674629, + "loss": 2.0473, + "num_input_tokens_seen": 67333928800, + "step": 128450 + }, + { + "epoch": 1.2258988516164422, + "grad_norm": 0.1351100355386734, + "learning_rate": 0.00036158224428757535, + "loss": 2.0475, + "num_input_tokens_seen": 67360131616, + "step": 128500 + }, + { + "epoch": 1.2258988516164422, + "eval_loss": 1.9701597690582275, + "eval_runtime": 82.4081, + "eval_samples_per_second": 60.674, + "eval_steps_per_second": 15.168, + "num_input_tokens_seen": 67360131616, + "step": 128500 + }, + { + "epoch": 1.2263758541326306, + "grad_norm": 0.13211333751678467, + "learning_rate": 0.00035888906977333857, + "loss": 2.0622, + "num_input_tokens_seen": 67386344736, + "step": 128550 + }, + { + "epoch": 1.2268528566488188, + "grad_norm": 0.12648384273052216, + "learning_rate": 0.0003562003362839914, + "loss": 2.051, + "num_input_tokens_seen": 67412555520, + "step": 128600 + }, + { + "epoch": 1.227329859165007, + "grad_norm": 0.13109999895095825, + "learning_rate": 0.00035351612843900553, + "loss": 2.0529, + "num_input_tokens_seen": 67438769504, + "step": 128650 + }, + { + "epoch": 1.2278068616811955, + "grad_norm": 0.12981992959976196, + "learning_rate": 0.000350836530715422, + "loss": 2.045, + "num_input_tokens_seen": 67464972864, + "step": 128700 + }, + { + "epoch": 1.2282838641973837, + "grad_norm": 0.1246839389204979, + "learning_rate": 0.00034816162744519263, + "loss": 2.0569, + "num_input_tokens_seen": 67491186176, + "step": 128750 + }, + { + "epoch": 1.2287608667135719, + "grad_norm": 0.13077682256698608, + "learning_rate": 0.00034549150281252633, + "loss": 2.0461, + "num_input_tokens_seen": 67517399168, + "step": 128800 + }, + { + "epoch": 1.22923786922976, + "grad_norm": 0.12939219176769257, + "learning_rate": 0.000342826240851239, + "loss": 2.047, + "num_input_tokens_seen": 67543606592, + "step": 128850 + }, + { + "epoch": 1.2297148717459485, + "grad_norm": 0.12711487710475922, + "learning_rate": 0.00034016592544210936, + "loss": 2.0411, + "num_input_tokens_seen": 67569807488, + "step": 128900 + }, + { + "epoch": 1.2301918742621367, + "grad_norm": 0.13154172897338867, + "learning_rate": 0.00033751064031023887, + "loss": 2.0536, + "num_input_tokens_seen": 67596020896, + "step": 128950 + }, + { + "epoch": 1.230668876778325, + "grad_norm": 0.1312495321035385, + "learning_rate": 0.00033486046902241664, + "loss": 2.0558, + "num_input_tokens_seen": 67622231264, + "step": 129000 + }, + { + "epoch": 1.230668876778325, + "eval_loss": 1.9686726331710815, + "eval_runtime": 82.3322, + "eval_samples_per_second": 60.73, + "eval_steps_per_second": 15.182, + "num_input_tokens_seen": 67622231264, + "step": 129000 + }, + { + "epoch": 1.2311458792945134, + "grad_norm": 0.13078469038009644, + "learning_rate": 0.00033221549498448967, + "loss": 2.0474, + "num_input_tokens_seen": 67648445664, + "step": 129050 + }, + { + "epoch": 1.2316228818107016, + "grad_norm": 0.1259986162185669, + "learning_rate": 0.0003295758014387375, + "loss": 2.0605, + "num_input_tokens_seen": 67674654432, + "step": 129100 + }, + { + "epoch": 1.2320998843268898, + "grad_norm": 0.13479039072990417, + "learning_rate": 0.0003269414714612534, + "loss": 2.0499, + "num_input_tokens_seen": 67700854208, + "step": 129150 + }, + { + "epoch": 1.232576886843078, + "grad_norm": 0.12382933497428894, + "learning_rate": 0.0003243125879593286, + "loss": 2.0403, + "num_input_tokens_seen": 67727067232, + "step": 129200 + }, + { + "epoch": 1.2330538893592664, + "grad_norm": 0.13765262067317963, + "learning_rate": 0.0003216892336688435, + "loss": 2.05, + "num_input_tokens_seen": 67753274144, + "step": 129250 + }, + { + "epoch": 1.2335308918754546, + "grad_norm": 0.13626757264137268, + "learning_rate": 0.000319071491151664, + "loss": 2.0533, + "num_input_tokens_seen": 67779485312, + "step": 129300 + }, + { + "epoch": 1.2340078943916428, + "grad_norm": 0.13541923463344574, + "learning_rate": 0.00031645944279304295, + "loss": 2.0502, + "num_input_tokens_seen": 67805697216, + "step": 129350 + }, + { + "epoch": 1.2344848969078313, + "grad_norm": 0.12669889628887177, + "learning_rate": 0.00031385317079902743, + "loss": 2.0434, + "num_input_tokens_seen": 67831908160, + "step": 129400 + }, + { + "epoch": 1.2349618994240195, + "grad_norm": 0.12400075793266296, + "learning_rate": 0.0003112527571938717, + "loss": 2.0556, + "num_input_tokens_seen": 67858116736, + "step": 129450 + }, + { + "epoch": 1.2354389019402077, + "grad_norm": 0.13263045251369476, + "learning_rate": 0.0003086582838174551, + "loss": 2.0405, + "num_input_tokens_seen": 67884327168, + "step": 129500 + }, + { + "epoch": 1.2354389019402077, + "eval_loss": 1.966764211654663, + "eval_runtime": 82.4836, + "eval_samples_per_second": 60.618, + "eval_steps_per_second": 15.155, + "num_input_tokens_seen": 67884327168, + "step": 129500 + }, + { + "epoch": 1.235915904456396, + "grad_norm": 0.12067709863185883, + "learning_rate": 0.00030606983232270746, + "loss": 2.0511, + "num_input_tokens_seen": 67910538880, + "step": 129550 + }, + { + "epoch": 1.2363929069725843, + "grad_norm": 0.13021409511566162, + "learning_rate": 0.0003034874841730382, + "loss": 2.0525, + "num_input_tokens_seen": 67936753280, + "step": 129600 + }, + { + "epoch": 1.2368699094887725, + "grad_norm": 0.12661676108837128, + "learning_rate": 0.0003009113206397734, + "loss": 2.0575, + "num_input_tokens_seen": 67962958784, + "step": 129650 + }, + { + "epoch": 1.237346912004961, + "grad_norm": 0.12730489671230316, + "learning_rate": 0.0002983414227995975, + "loss": 2.0552, + "num_input_tokens_seen": 67989169536, + "step": 129700 + }, + { + "epoch": 1.2378239145211491, + "grad_norm": 0.12583428621292114, + "learning_rate": 0.000295777871532002, + "loss": 2.0413, + "num_input_tokens_seen": 68015382560, + "step": 129750 + }, + { + "epoch": 1.2383009170373374, + "grad_norm": 0.12833881378173828, + "learning_rate": 0.00029322074751673977, + "loss": 2.0456, + "num_input_tokens_seen": 68041596960, + "step": 129800 + }, + { + "epoch": 1.2387779195535256, + "grad_norm": 0.1263890564441681, + "learning_rate": 0.0002906701312312861, + "loss": 2.0506, + "num_input_tokens_seen": 68067805312, + "step": 129850 + }, + { + "epoch": 1.239254922069714, + "grad_norm": 0.1265845000743866, + "learning_rate": 0.0002881261029483057, + "loss": 2.0376, + "num_input_tokens_seen": 68094019712, + "step": 129900 + }, + { + "epoch": 1.2397319245859022, + "grad_norm": 0.1379150003194809, + "learning_rate": 0.0002855887427331267, + "loss": 2.0482, + "num_input_tokens_seen": 68120232192, + "step": 129950 + }, + { + "epoch": 1.2402089271020904, + "grad_norm": 0.12455019354820251, + "learning_rate": 0.00028305813044122096, + "loss": 2.038, + "num_input_tokens_seen": 68146442176, + "step": 130000 + }, + { + "epoch": 1.2402089271020904, + "eval_loss": 1.965224266052246, + "eval_runtime": 83.0846, + "eval_samples_per_second": 60.18, + "eval_steps_per_second": 15.045, + "num_input_tokens_seen": 68146442176, + "step": 130000 + }, + { + "epoch": 1.2406859296182788, + "grad_norm": 0.12637196481227875, + "learning_rate": 0.00028053434571568983, + "loss": 2.0543, + "num_input_tokens_seen": 68172655040, + "step": 130050 + }, + { + "epoch": 1.241162932134467, + "grad_norm": 0.1351892203092575, + "learning_rate": 0.000278017467984759, + "loss": 2.0578, + "num_input_tokens_seen": 68198869440, + "step": 130100 + }, + { + "epoch": 1.2416399346506553, + "grad_norm": 0.12203965336084366, + "learning_rate": 0.00027550757645927764, + "loss": 2.0427, + "num_input_tokens_seen": 68225083840, + "step": 130150 + }, + { + "epoch": 1.2421169371668435, + "grad_norm": 0.13395994901657104, + "learning_rate": 0.00027300475013022663, + "loss": 2.0488, + "num_input_tokens_seen": 68251293952, + "step": 130200 + }, + { + "epoch": 1.242593939683032, + "grad_norm": 0.1291465014219284, + "learning_rate": 0.0002705090677662311, + "loss": 2.0484, + "num_input_tokens_seen": 68277498432, + "step": 130250 + }, + { + "epoch": 1.24307094219922, + "grad_norm": 0.12472834438085556, + "learning_rate": 0.000268020607911083, + "loss": 2.0538, + "num_input_tokens_seen": 68303709440, + "step": 130300 + }, + { + "epoch": 1.2435479447154083, + "grad_norm": 0.1263572871685028, + "learning_rate": 0.0002655394488812677, + "loss": 2.0487, + "num_input_tokens_seen": 68329920512, + "step": 130350 + }, + { + "epoch": 1.2440249472315967, + "grad_norm": 0.12614773213863373, + "learning_rate": 0.0002630656687635007, + "loss": 2.053, + "num_input_tokens_seen": 68356112384, + "step": 130400 + }, + { + "epoch": 1.244501949747785, + "grad_norm": 0.1241307333111763, + "learning_rate": 0.0002605993454122687, + "loss": 2.049, + "num_input_tokens_seen": 68382320896, + "step": 130450 + }, + { + "epoch": 1.2449789522639731, + "grad_norm": 0.12764516472816467, + "learning_rate": 0.0002581405564473801, + "loss": 2.0338, + "num_input_tokens_seen": 68408534464, + "step": 130500 + }, + { + "epoch": 1.2449789522639731, + "eval_loss": 1.9643968343734741, + "eval_runtime": 82.7385, + "eval_samples_per_second": 60.431, + "eval_steps_per_second": 15.108, + "num_input_tokens_seen": 68408534464, + "step": 130500 + }, + { + "epoch": 1.2454559547801614, + "grad_norm": 0.1308233141899109, + "learning_rate": 0.0002556893792515227, + "loss": 2.0371, + "num_input_tokens_seen": 68434747040, + "step": 130550 + }, + { + "epoch": 1.2459329572963498, + "grad_norm": 0.12745235860347748, + "learning_rate": 0.00025324589096782657, + "loss": 2.0373, + "num_input_tokens_seen": 68460951616, + "step": 130600 + }, + { + "epoch": 1.246409959812538, + "grad_norm": 0.1278812736272812, + "learning_rate": 0.0002508101684974387, + "loss": 2.0405, + "num_input_tokens_seen": 68487165696, + "step": 130650 + }, + { + "epoch": 1.2468869623287262, + "grad_norm": 0.12204719334840775, + "learning_rate": 0.00024838228849709997, + "loss": 2.0424, + "num_input_tokens_seen": 68513380096, + "step": 130700 + }, + { + "epoch": 1.2473639648449146, + "grad_norm": 0.11976956576108932, + "learning_rate": 0.0002459623273767354, + "loss": 2.0596, + "num_input_tokens_seen": 68539590240, + "step": 130750 + }, + { + "epoch": 1.2478409673611028, + "grad_norm": 0.13120809197425842, + "learning_rate": 0.000243550361297047, + "loss": 2.037, + "num_input_tokens_seen": 68565804640, + "step": 130800 + }, + { + "epoch": 1.248317969877291, + "grad_norm": 0.12905927002429962, + "learning_rate": 0.00024114646616711844, + "loss": 2.0341, + "num_input_tokens_seen": 68592007552, + "step": 130850 + }, + { + "epoch": 1.2487949723934793, + "grad_norm": 0.12697407603263855, + "learning_rate": 0.00023875071764202561, + "loss": 2.05, + "num_input_tokens_seen": 68618221952, + "step": 130900 + }, + { + "epoch": 1.2492719749096677, + "grad_norm": 0.12694934010505676, + "learning_rate": 0.00023636319112045495, + "loss": 2.0436, + "num_input_tokens_seen": 68644425984, + "step": 130950 + }, + { + "epoch": 1.249748977425856, + "grad_norm": 0.1360025703907013, + "learning_rate": 0.00023398396174233177, + "loss": 2.0506, + "num_input_tokens_seen": 68670633664, + "step": 131000 + }, + { + "epoch": 1.249748977425856, + "eval_loss": 1.962631106376648, + "eval_runtime": 82.4327, + "eval_samples_per_second": 60.656, + "eval_steps_per_second": 15.164, + "num_input_tokens_seen": 68670633664, + "step": 131000 + }, + { + "epoch": 1.2502259799420443, + "grad_norm": 0.13041457533836365, + "learning_rate": 0.000231613104386454, + "loss": 2.0362, + "num_input_tokens_seen": 68696842016, + "step": 131050 + }, + { + "epoch": 1.2507029824582325, + "grad_norm": 0.1306309849023819, + "learning_rate": 0.00022925069366813716, + "loss": 2.0593, + "num_input_tokens_seen": 68723054176, + "step": 131100 + }, + { + "epoch": 1.2511799849744207, + "grad_norm": 0.12761172652244568, + "learning_rate": 0.00022689680393686457, + "loss": 2.0496, + "num_input_tokens_seen": 68749263552, + "step": 131150 + }, + { + "epoch": 1.251656987490609, + "grad_norm": 0.12187056988477707, + "learning_rate": 0.0002245515092739488, + "loss": 2.0417, + "num_input_tokens_seen": 68775477952, + "step": 131200 + }, + { + "epoch": 1.2521339900067971, + "grad_norm": 0.12770666182041168, + "learning_rate": 0.00022221488349019903, + "loss": 2.0332, + "num_input_tokens_seen": 68801692352, + "step": 131250 + }, + { + "epoch": 1.2526109925229856, + "grad_norm": 0.13457396626472473, + "learning_rate": 0.00021988700012359863, + "loss": 2.0393, + "num_input_tokens_seen": 68827900832, + "step": 131300 + }, + { + "epoch": 1.2530879950391738, + "grad_norm": 0.12845295667648315, + "learning_rate": 0.0002175679324369913, + "loss": 2.0507, + "num_input_tokens_seen": 68854107328, + "step": 131350 + }, + { + "epoch": 1.2535649975553622, + "grad_norm": 0.12990029156208038, + "learning_rate": 0.00021525775341577403, + "loss": 2.0373, + "num_input_tokens_seen": 68880316256, + "step": 131400 + }, + { + "epoch": 1.2540420000715504, + "grad_norm": 0.12344187498092651, + "learning_rate": 0.00021295653576560165, + "loss": 2.0359, + "num_input_tokens_seen": 68906521376, + "step": 131450 + }, + { + "epoch": 1.2545190025877386, + "grad_norm": 0.12487955391407013, + "learning_rate": 0.00021066435191009715, + "loss": 2.0432, + "num_input_tokens_seen": 68932735776, + "step": 131500 + }, + { + "epoch": 1.2545190025877386, + "eval_loss": 1.9613933563232422, + "eval_runtime": 82.9225, + "eval_samples_per_second": 60.297, + "eval_steps_per_second": 15.074, + "num_input_tokens_seen": 68932735776, + "step": 131500 + }, + { + "epoch": 1.2549960051039268, + "grad_norm": 0.13224980235099792, + "learning_rate": 0.00020838127398857382, + "loss": 2.0413, + "num_input_tokens_seen": 68958946656, + "step": 131550 + }, + { + "epoch": 1.2554730076201153, + "grad_norm": 0.12449366599321365, + "learning_rate": 0.00020610737385376348, + "loss": 2.0503, + "num_input_tokens_seen": 68985155520, + "step": 131600 + }, + { + "epoch": 1.2559500101363035, + "grad_norm": 0.12943805754184723, + "learning_rate": 0.0002038427230695565, + "loss": 2.0476, + "num_input_tokens_seen": 69011368384, + "step": 131650 + }, + { + "epoch": 1.2564270126524917, + "grad_norm": 0.1288331300020218, + "learning_rate": 0.00020158739290874821, + "loss": 2.0458, + "num_input_tokens_seen": 69037580736, + "step": 131700 + }, + { + "epoch": 1.25690401516868, + "grad_norm": 0.12655895948410034, + "learning_rate": 0.00019934145435079704, + "loss": 2.0474, + "num_input_tokens_seen": 69063793760, + "step": 131750 + }, + { + "epoch": 1.2573810176848683, + "grad_norm": 0.1263783723115921, + "learning_rate": 0.0001971049780795901, + "loss": 2.0387, + "num_input_tokens_seen": 69090002496, + "step": 131800 + }, + { + "epoch": 1.2578580202010565, + "grad_norm": 0.13202515244483948, + "learning_rate": 0.0001948780344812181, + "loss": 2.0531, + "num_input_tokens_seen": 69116216896, + "step": 131850 + }, + { + "epoch": 1.2583350227172447, + "grad_norm": 0.12061940133571625, + "learning_rate": 0.00019266069364176142, + "loss": 2.052, + "num_input_tokens_seen": 69142427680, + "step": 131900 + }, + { + "epoch": 1.2588120252334332, + "grad_norm": 0.1222308874130249, + "learning_rate": 0.00019045302534508295, + "loss": 2.0409, + "num_input_tokens_seen": 69168631136, + "step": 131950 + }, + { + "epoch": 1.2592890277496214, + "grad_norm": 0.11664976924657822, + "learning_rate": 0.00018825509907063325, + "loss": 2.0361, + "num_input_tokens_seen": 69194840608, + "step": 132000 + }, + { + "epoch": 1.2592890277496214, + "eval_loss": 1.9602855443954468, + "eval_runtime": 82.6066, + "eval_samples_per_second": 60.528, + "eval_steps_per_second": 15.132, + "num_input_tokens_seen": 69194840608, + "step": 132000 + }, + { + "epoch": 1.2597660302658096, + "grad_norm": 0.11991748213768005, + "learning_rate": 0.0001860669839912626, + "loss": 2.0354, + "num_input_tokens_seen": 69221050496, + "step": 132050 + }, + { + "epoch": 1.260243032781998, + "grad_norm": 0.11859247088432312, + "learning_rate": 0.00018388874897104518, + "loss": 2.0449, + "num_input_tokens_seen": 69247257536, + "step": 132100 + }, + { + "epoch": 1.2607200352981862, + "grad_norm": 0.12269642949104309, + "learning_rate": 0.00018172046256311088, + "loss": 2.0427, + "num_input_tokens_seen": 69273469824, + "step": 132150 + }, + { + "epoch": 1.2611970378143744, + "grad_norm": 0.11893275380134583, + "learning_rate": 0.00017956219300748795, + "loss": 2.0366, + "num_input_tokens_seen": 69299684224, + "step": 132200 + }, + { + "epoch": 1.2616740403305626, + "grad_norm": 0.12191104143857956, + "learning_rate": 0.0001774140082289563, + "loss": 2.0393, + "num_input_tokens_seen": 69325894496, + "step": 132250 + }, + { + "epoch": 1.262151042846751, + "grad_norm": 0.12704069912433624, + "learning_rate": 0.00017527597583490823, + "loss": 2.0551, + "num_input_tokens_seen": 69352101952, + "step": 132300 + }, + { + "epoch": 1.2626280453629393, + "grad_norm": 0.12682849168777466, + "learning_rate": 0.00017314816311322218, + "loss": 2.0376, + "num_input_tokens_seen": 69378314752, + "step": 132350 + }, + { + "epoch": 1.2631050478791277, + "grad_norm": 0.1246429830789566, + "learning_rate": 0.00017103063703014372, + "loss": 2.0402, + "num_input_tokens_seen": 69404523776, + "step": 132400 + }, + { + "epoch": 1.263582050395316, + "grad_norm": 0.12006555497646332, + "learning_rate": 0.00016892346422817944, + "loss": 2.0383, + "num_input_tokens_seen": 69430732160, + "step": 132450 + }, + { + "epoch": 1.264059052911504, + "grad_norm": 0.12435656785964966, + "learning_rate": 0.00016682671102399805, + "loss": 2.0347, + "num_input_tokens_seen": 69456943424, + "step": 132500 + }, + { + "epoch": 1.264059052911504, + "eval_loss": 1.9590063095092773, + "eval_runtime": 82.7888, + "eval_samples_per_second": 60.395, + "eval_steps_per_second": 15.099, + "num_input_tokens_seen": 69456943424, + "step": 132500 + }, + { + "epoch": 1.2645360554276923, + "grad_norm": 0.12412598729133606, + "learning_rate": 0.0001647404434063447, + "loss": 2.0436, + "num_input_tokens_seen": 69483146688, + "step": 132550 + }, + { + "epoch": 1.2650130579438805, + "grad_norm": 0.12309623509645462, + "learning_rate": 0.00016266472703396284, + "loss": 2.028, + "num_input_tokens_seen": 69509359968, + "step": 132600 + }, + { + "epoch": 1.265490060460069, + "grad_norm": 0.12758532166481018, + "learning_rate": 0.0001605996272335291, + "loss": 2.041, + "num_input_tokens_seen": 69535568960, + "step": 132650 + }, + { + "epoch": 1.2659670629762572, + "grad_norm": 0.11922606080770493, + "learning_rate": 0.00015854520899759655, + "loss": 2.0308, + "num_input_tokens_seen": 69561777024, + "step": 132700 + }, + { + "epoch": 1.2664440654924456, + "grad_norm": 0.1239946112036705, + "learning_rate": 0.00015650153698254916, + "loss": 2.0336, + "num_input_tokens_seen": 69587981952, + "step": 132750 + }, + { + "epoch": 1.2669210680086338, + "grad_norm": 0.12584541738033295, + "learning_rate": 0.00015446867550656767, + "loss": 2.0376, + "num_input_tokens_seen": 69614192832, + "step": 132800 + }, + { + "epoch": 1.267398070524822, + "grad_norm": 0.12514598667621613, + "learning_rate": 0.00015244668854760458, + "loss": 2.0411, + "num_input_tokens_seen": 69640405600, + "step": 132850 + }, + { + "epoch": 1.2678750730410102, + "grad_norm": 0.12181352823972702, + "learning_rate": 0.00015043563974137132, + "loss": 2.0404, + "num_input_tokens_seen": 69666619040, + "step": 132900 + }, + { + "epoch": 1.2683520755571986, + "grad_norm": 0.11871461570262909, + "learning_rate": 0.00014843559237933475, + "loss": 2.0458, + "num_input_tokens_seen": 69692833440, + "step": 132950 + }, + { + "epoch": 1.2688290780733869, + "grad_norm": 0.12271245568990707, + "learning_rate": 0.00014644660940672628, + "loss": 2.0354, + "num_input_tokens_seen": 69719047840, + "step": 133000 + }, + { + "epoch": 1.2688290780733869, + "eval_loss": 1.9576880931854248, + "eval_runtime": 82.558, + "eval_samples_per_second": 60.564, + "eval_steps_per_second": 15.141, + "num_input_tokens_seen": 69719047840, + "step": 133000 + }, + { + "epoch": 1.269306080589575, + "grad_norm": 0.12358897924423218, + "learning_rate": 0.00014446875342055988, + "loss": 2.0342, + "num_input_tokens_seen": 69745262240, + "step": 133050 + }, + { + "epoch": 1.2697830831057635, + "grad_norm": 0.12031599134206772, + "learning_rate": 0.00014250208666766236, + "loss": 2.0402, + "num_input_tokens_seen": 69771476640, + "step": 133100 + }, + { + "epoch": 1.2702600856219517, + "grad_norm": 0.12011140584945679, + "learning_rate": 0.00014054667104271496, + "loss": 2.0358, + "num_input_tokens_seen": 69797691040, + "step": 133150 + }, + { + "epoch": 1.27073708813814, + "grad_norm": 0.12352379411458969, + "learning_rate": 0.00013860256808630427, + "loss": 2.043, + "num_input_tokens_seen": 69823902816, + "step": 133200 + }, + { + "epoch": 1.271214090654328, + "grad_norm": 0.1257781833410263, + "learning_rate": 0.00013666983898298656, + "loss": 2.0464, + "num_input_tokens_seen": 69850112224, + "step": 133250 + }, + { + "epoch": 1.2716910931705165, + "grad_norm": 0.12694838643074036, + "learning_rate": 0.00013474854455936125, + "loss": 2.0401, + "num_input_tokens_seen": 69876325568, + "step": 133300 + }, + { + "epoch": 1.2721680956867047, + "grad_norm": 0.12634819746017456, + "learning_rate": 0.00013283874528215734, + "loss": 2.0339, + "num_input_tokens_seen": 69902536928, + "step": 133350 + }, + { + "epoch": 1.272645098202893, + "grad_norm": 0.12307710945606232, + "learning_rate": 0.00013094050125632973, + "loss": 2.0277, + "num_input_tokens_seen": 69928748288, + "step": 133400 + }, + { + "epoch": 1.2731221007190814, + "grad_norm": 0.12187953293323517, + "learning_rate": 0.00012905387222316822, + "loss": 2.0402, + "num_input_tokens_seen": 69954953888, + "step": 133450 + }, + { + "epoch": 1.2735991032352696, + "grad_norm": 0.12032655626535416, + "learning_rate": 0.0001271789175584172, + "loss": 2.0419, + "num_input_tokens_seen": 69981165632, + "step": 133500 + }, + { + "epoch": 1.2735991032352696, + "eval_loss": 1.9568681716918945, + "eval_runtime": 82.7406, + "eval_samples_per_second": 60.43, + "eval_steps_per_second": 15.107, + "num_input_tokens_seen": 69981165632, + "step": 133500 + }, + { + "epoch": 1.2740761057514578, + "grad_norm": 0.12817110121250153, + "learning_rate": 0.00012531569627040635, + "loss": 2.034, + "num_input_tokens_seen": 70007368800, + "step": 133550 + }, + { + "epoch": 1.274553108267646, + "grad_norm": 0.13095012307167053, + "learning_rate": 0.00012346426699819457, + "loss": 2.0346, + "num_input_tokens_seen": 70033578048, + "step": 133600 + }, + { + "epoch": 1.2750301107838344, + "grad_norm": 0.12582357227802277, + "learning_rate": 0.00012162468800972342, + "loss": 2.0398, + "num_input_tokens_seen": 70059792448, + "step": 133650 + }, + { + "epoch": 1.2755071133000226, + "grad_norm": 0.11612017452716827, + "learning_rate": 0.00011979701719998454, + "loss": 2.0341, + "num_input_tokens_seen": 70086003648, + "step": 133700 + }, + { + "epoch": 1.2759841158162109, + "grad_norm": 0.12256049364805222, + "learning_rate": 0.00011798131208919626, + "loss": 2.029, + "num_input_tokens_seen": 70112204096, + "step": 133750 + }, + { + "epoch": 1.2764611183323993, + "grad_norm": 0.11747635900974274, + "learning_rate": 0.00011617762982099444, + "loss": 2.0355, + "num_input_tokens_seen": 70138411104, + "step": 133800 + }, + { + "epoch": 1.2769381208485875, + "grad_norm": 0.12225272506475449, + "learning_rate": 0.00011438602716063329, + "loss": 2.042, + "num_input_tokens_seen": 70164623328, + "step": 133850 + }, + { + "epoch": 1.2774151233647757, + "grad_norm": 0.1293225735425949, + "learning_rate": 0.00011260656049319957, + "loss": 2.0367, + "num_input_tokens_seen": 70190833888, + "step": 133900 + }, + { + "epoch": 1.277892125880964, + "grad_norm": 0.12261593341827393, + "learning_rate": 0.0001108392858218371, + "loss": 2.0444, + "num_input_tokens_seen": 70217043648, + "step": 133950 + }, + { + "epoch": 1.2783691283971523, + "grad_norm": 0.11957214772701263, + "learning_rate": 0.0001090842587659851, + "loss": 2.0345, + "num_input_tokens_seen": 70243253472, + "step": 134000 + }, + { + "epoch": 1.2783691283971523, + "eval_loss": 1.955412745475769, + "eval_runtime": 82.5981, + "eval_samples_per_second": 60.534, + "eval_steps_per_second": 15.134, + "num_input_tokens_seen": 70243253472, + "step": 134000 + }, + { + "epoch": 1.2788461309133405, + "grad_norm": 0.12490282952785492, + "learning_rate": 0.00010734153455962764, + "loss": 2.0308, + "num_input_tokens_seen": 70269466208, + "step": 134050 + }, + { + "epoch": 1.279323133429529, + "grad_norm": 0.12396061420440674, + "learning_rate": 0.00010561116804955451, + "loss": 2.036, + "num_input_tokens_seen": 70295676096, + "step": 134100 + }, + { + "epoch": 1.2798001359457172, + "grad_norm": 0.12122515588998795, + "learning_rate": 0.00010389321369363636, + "loss": 2.0424, + "num_input_tokens_seen": 70321882272, + "step": 134150 + }, + { + "epoch": 1.2802771384619054, + "grad_norm": 0.12559206783771515, + "learning_rate": 0.00010218772555910954, + "loss": 2.0456, + "num_input_tokens_seen": 70348095808, + "step": 134200 + }, + { + "epoch": 1.2807541409780936, + "grad_norm": 0.11915505677461624, + "learning_rate": 0.0001004947573208756, + "loss": 2.0412, + "num_input_tokens_seen": 70374304800, + "step": 134250 + }, + { + "epoch": 1.2812311434942818, + "grad_norm": 0.12196268141269684, + "learning_rate": 9.881436225981105e-05, + "loss": 2.0386, + "num_input_tokens_seen": 70400510976, + "step": 134300 + }, + { + "epoch": 1.2817081460104702, + "grad_norm": 0.12415535002946854, + "learning_rate": 9.714659326109137e-05, + "loss": 2.0448, + "num_input_tokens_seen": 70426725376, + "step": 134350 + }, + { + "epoch": 1.2821851485266584, + "grad_norm": 0.12361661344766617, + "learning_rate": 9.549150281252633e-05, + "loss": 2.0371, + "num_input_tokens_seen": 70452929792, + "step": 134400 + }, + { + "epoch": 1.2826621510428469, + "grad_norm": 0.12377167493104935, + "learning_rate": 9.384914300290748e-05, + "loss": 2.0344, + "num_input_tokens_seen": 70479144192, + "step": 134450 + }, + { + "epoch": 1.283139153559035, + "grad_norm": 0.11863281577825546, + "learning_rate": 9.221956552036992e-05, + "loss": 2.0393, + "num_input_tokens_seen": 70505353504, + "step": 134500 + }, + { + "epoch": 1.283139153559035, + "eval_loss": 1.9545812606811523, + "eval_runtime": 82.3767, + "eval_samples_per_second": 60.697, + "eval_steps_per_second": 15.174, + "num_input_tokens_seen": 70505353504, + "step": 134500 + }, + { + "epoch": 1.2836161560752233, + "grad_norm": 0.12550202012062073, + "learning_rate": 9.060282165076461e-05, + "loss": 2.0483, + "num_input_tokens_seen": 70531564640, + "step": 134550 + }, + { + "epoch": 1.2840931585914115, + "grad_norm": 0.12165137380361557, + "learning_rate": 8.899896227604509e-05, + "loss": 2.034, + "num_input_tokens_seen": 70557777824, + "step": 134600 + }, + { + "epoch": 1.2845701611076, + "grad_norm": 0.12417840212583542, + "learning_rate": 8.740803787266521e-05, + "loss": 2.0381, + "num_input_tokens_seen": 70583987456, + "step": 134650 + }, + { + "epoch": 1.2850471636237881, + "grad_norm": 0.12609820067882538, + "learning_rate": 8.58300985099918e-05, + "loss": 2.0369, + "num_input_tokens_seen": 70610189152, + "step": 134700 + }, + { + "epoch": 1.2855241661399763, + "grad_norm": 0.1163376122713089, + "learning_rate": 8.426519384872733e-05, + "loss": 2.0236, + "num_input_tokens_seen": 70636401088, + "step": 134750 + }, + { + "epoch": 1.2860011686561648, + "grad_norm": 0.11958843469619751, + "learning_rate": 8.271337313934868e-05, + "loss": 2.0465, + "num_input_tokens_seen": 70662608672, + "step": 134800 + }, + { + "epoch": 1.286478171172353, + "grad_norm": 0.12234240025281906, + "learning_rate": 8.117468522055577e-05, + "loss": 2.0384, + "num_input_tokens_seen": 70688820640, + "step": 134850 + }, + { + "epoch": 1.2869551736885412, + "grad_norm": 0.11501733213663101, + "learning_rate": 7.964917851773496e-05, + "loss": 2.0343, + "num_input_tokens_seen": 70715035040, + "step": 134900 + }, + { + "epoch": 1.2874321762047294, + "grad_norm": 0.12062328308820724, + "learning_rate": 7.813690104143555e-05, + "loss": 2.0211, + "num_input_tokens_seen": 70741249088, + "step": 134950 + }, + { + "epoch": 1.2879091787209178, + "grad_norm": 0.11405592411756516, + "learning_rate": 7.663790038585794e-05, + "loss": 2.0401, + "num_input_tokens_seen": 70767457344, + "step": 135000 + }, + { + "epoch": 1.2879091787209178, + "eval_loss": 1.9541493654251099, + "eval_runtime": 82.5619, + "eval_samples_per_second": 60.561, + "eval_steps_per_second": 15.14, + "num_input_tokens_seen": 70767457344, + "step": 135000 + }, + { + "epoch": 1.288386181237106, + "grad_norm": 0.1237749382853508, + "learning_rate": 7.515222372735647e-05, + "loss": 2.029, + "num_input_tokens_seen": 70793671744, + "step": 135050 + }, + { + "epoch": 1.2888631837532942, + "grad_norm": 0.11638092249631882, + "learning_rate": 7.367991782295391e-05, + "loss": 2.0171, + "num_input_tokens_seen": 70819879168, + "step": 135100 + }, + { + "epoch": 1.2893401862694827, + "grad_norm": 0.11938998103141785, + "learning_rate": 7.222102900887101e-05, + "loss": 2.0232, + "num_input_tokens_seen": 70846079616, + "step": 135150 + }, + { + "epoch": 1.2898171887856709, + "grad_norm": 0.11985292285680771, + "learning_rate": 7.077560319906695e-05, + "loss": 2.0387, + "num_input_tokens_seen": 70872294016, + "step": 135200 + }, + { + "epoch": 1.290294191301859, + "grad_norm": 0.12651756405830383, + "learning_rate": 6.934368588379552e-05, + "loss": 2.0345, + "num_input_tokens_seen": 70898498624, + "step": 135250 + }, + { + "epoch": 1.2907711938180473, + "grad_norm": 0.12012086063623428, + "learning_rate": 6.792532212817271e-05, + "loss": 2.0362, + "num_input_tokens_seen": 70924710048, + "step": 135300 + }, + { + "epoch": 1.2912481963342357, + "grad_norm": 0.12295469641685486, + "learning_rate": 6.652055657075845e-05, + "loss": 2.0338, + "num_input_tokens_seen": 70950915200, + "step": 135350 + }, + { + "epoch": 1.291725198850424, + "grad_norm": 0.12192966043949127, + "learning_rate": 6.512943342215233e-05, + "loss": 2.0311, + "num_input_tokens_seen": 70977118208, + "step": 135400 + }, + { + "epoch": 1.2922022013666123, + "grad_norm": 0.1188386008143425, + "learning_rate": 6.375199646360142e-05, + "loss": 2.0311, + "num_input_tokens_seen": 71003331520, + "step": 135450 + }, + { + "epoch": 1.2926792038828006, + "grad_norm": 0.11646123230457306, + "learning_rate": 6.238828904562316e-05, + "loss": 2.037, + "num_input_tokens_seen": 71029545920, + "step": 135500 + }, + { + "epoch": 1.2926792038828006, + "eval_loss": 1.9530843496322632, + "eval_runtime": 82.2362, + "eval_samples_per_second": 60.8, + "eval_steps_per_second": 15.2, + "num_input_tokens_seen": 71029545920, + "step": 135500 + }, + { + "epoch": 1.2931562063989888, + "grad_norm": 0.12359626591205597, + "learning_rate": 6.103835408664032e-05, + "loss": 2.0441, + "num_input_tokens_seen": 71055753312, + "step": 135550 + }, + { + "epoch": 1.293633208915177, + "grad_norm": 0.12097882479429245, + "learning_rate": 5.9702234071631e-05, + "loss": 2.0251, + "num_input_tokens_seen": 71081964480, + "step": 135600 + }, + { + "epoch": 1.2941102114313652, + "grad_norm": 0.11585067212581635, + "learning_rate": 5.83799710507909e-05, + "loss": 2.0352, + "num_input_tokens_seen": 71108163424, + "step": 135650 + }, + { + "epoch": 1.2945872139475536, + "grad_norm": 0.12164249271154404, + "learning_rate": 5.7071606638210094e-05, + "loss": 2.0314, + "num_input_tokens_seen": 71134375424, + "step": 135700 + }, + { + "epoch": 1.2950642164637418, + "grad_norm": 0.11601755023002625, + "learning_rate": 5.577718201056392e-05, + "loss": 2.0313, + "num_input_tokens_seen": 71160582688, + "step": 135750 + }, + { + "epoch": 1.2955412189799302, + "grad_norm": 0.11863810569047928, + "learning_rate": 5.449673790581611e-05, + "loss": 2.036, + "num_input_tokens_seen": 71186792800, + "step": 135800 + }, + { + "epoch": 1.2960182214961184, + "grad_norm": 0.12455905973911285, + "learning_rate": 5.3230314621937556e-05, + "loss": 2.0316, + "num_input_tokens_seen": 71213000416, + "step": 135850 + }, + { + "epoch": 1.2964952240123067, + "grad_norm": 0.11861378699541092, + "learning_rate": 5.197795201563743e-05, + "loss": 2.0334, + "num_input_tokens_seen": 71239212224, + "step": 135900 + }, + { + "epoch": 1.2969722265284949, + "grad_norm": 0.11894825845956802, + "learning_rate": 5.073968950110941e-05, + "loss": 2.028, + "num_input_tokens_seen": 71265425728, + "step": 135950 + }, + { + "epoch": 1.297449229044683, + "grad_norm": 0.11746333539485931, + "learning_rate": 4.9515566048790485e-05, + "loss": 2.0302, + "num_input_tokens_seen": 71291638272, + "step": 136000 + }, + { + "epoch": 1.297449229044683, + "eval_loss": 1.9527229070663452, + "eval_runtime": 82.9319, + "eval_samples_per_second": 60.29, + "eval_steps_per_second": 15.073, + "num_input_tokens_seen": 71291638272, + "step": 136000 + }, + { + "epoch": 1.2979262315608715, + "grad_norm": 0.1190498098731041, + "learning_rate": 4.8305620184135315e-05, + "loss": 2.0321, + "num_input_tokens_seen": 71317844512, + "step": 136050 + }, + { + "epoch": 1.2984032340770597, + "grad_norm": 0.11770997196435928, + "learning_rate": 4.7109889986402973e-05, + "loss": 2.0341, + "num_input_tokens_seen": 71344050560, + "step": 136100 + }, + { + "epoch": 1.2988802365932481, + "grad_norm": 0.11683844774961472, + "learning_rate": 4.592841308745932e-05, + "loss": 2.0243, + "num_input_tokens_seen": 71370258656, + "step": 136150 + }, + { + "epoch": 1.2993572391094363, + "grad_norm": 0.12114414572715759, + "learning_rate": 4.476122667059207e-05, + "loss": 2.0379, + "num_input_tokens_seen": 71396470656, + "step": 136200 + }, + { + "epoch": 1.2998342416256246, + "grad_norm": 0.11975762993097305, + "learning_rate": 4.3608367469340547e-05, + "loss": 2.0359, + "num_input_tokens_seen": 71422685056, + "step": 136250 + }, + { + "epoch": 1.3003112441418128, + "grad_norm": 0.11278797686100006, + "learning_rate": 4.2469871766340095e-05, + "loss": 2.0219, + "num_input_tokens_seen": 71448892928, + "step": 136300 + }, + { + "epoch": 1.3007882466580012, + "grad_norm": 0.11854268610477448, + "learning_rate": 4.1345775392179654e-05, + "loss": 2.0404, + "num_input_tokens_seen": 71475094528, + "step": 136350 + }, + { + "epoch": 1.3012652491741894, + "grad_norm": 0.11631016433238983, + "learning_rate": 4.0236113724274713e-05, + "loss": 2.0301, + "num_input_tokens_seen": 71501303968, + "step": 136400 + }, + { + "epoch": 1.3017422516903776, + "grad_norm": 0.11170602589845657, + "learning_rate": 3.9140921685753064e-05, + "loss": 2.0431, + "num_input_tokens_seen": 71527518368, + "step": 136450 + }, + { + "epoch": 1.302219254206566, + "grad_norm": 0.11311063915491104, + "learning_rate": 3.806023374435663e-05, + "loss": 2.0173, + "num_input_tokens_seen": 71553726688, + "step": 136500 + }, + { + "epoch": 1.302219254206566, + "eval_loss": 1.9524949789047241, + "eval_runtime": 83.0874, + "eval_samples_per_second": 60.178, + "eval_steps_per_second": 15.044, + "num_input_tokens_seen": 71553726688, + "step": 136500 + }, + { + "epoch": 1.3026962567227542, + "grad_norm": 0.728589653968811, + "learning_rate": 3.699408391135611e-05, + "loss": 2.0415, + "num_input_tokens_seen": 71579934304, + "step": 136550 + }, + { + "epoch": 1.3031732592389424, + "grad_norm": 0.11253057420253754, + "learning_rate": 3.594250574048058e-05, + "loss": 2.0334, + "num_input_tokens_seen": 71606145184, + "step": 136600 + }, + { + "epoch": 1.3036502617551307, + "grad_norm": 0.12201691418886185, + "learning_rate": 3.4905532326861944e-05, + "loss": 2.0403, + "num_input_tokens_seen": 71632351648, + "step": 136650 + }, + { + "epoch": 1.304127264271319, + "grad_norm": 0.11976749449968338, + "learning_rate": 3.3883196305992905e-05, + "loss": 2.0292, + "num_input_tokens_seen": 71658566048, + "step": 136700 + }, + { + "epoch": 1.3046042667875073, + "grad_norm": 0.12131944298744202, + "learning_rate": 3.2875529852700146e-05, + "loss": 2.0405, + "num_input_tokens_seen": 71684775808, + "step": 136750 + }, + { + "epoch": 1.3050812693036955, + "grad_norm": 0.11625051498413086, + "learning_rate": 3.18825646801314e-05, + "loss": 2.0392, + "num_input_tokens_seen": 71710990048, + "step": 136800 + }, + { + "epoch": 1.305558271819884, + "grad_norm": 0.11870067566633224, + "learning_rate": 3.0904332038757974e-05, + "loss": 2.0388, + "num_input_tokens_seen": 71737198176, + "step": 136850 + }, + { + "epoch": 1.3060352743360721, + "grad_norm": 0.11490604281425476, + "learning_rate": 2.994086271539048e-05, + "loss": 2.0261, + "num_input_tokens_seen": 71763409248, + "step": 136900 + }, + { + "epoch": 1.3065122768522603, + "grad_norm": 0.1218944787979126, + "learning_rate": 2.8992187032210516e-05, + "loss": 2.0421, + "num_input_tokens_seen": 71789610880, + "step": 136950 + }, + { + "epoch": 1.3069892793684486, + "grad_norm": 0.11681609600782394, + "learning_rate": 2.8058334845816213e-05, + "loss": 2.0287, + "num_input_tokens_seen": 71815816608, + "step": 137000 + }, + { + "epoch": 1.3069892793684486, + "eval_loss": 1.951898455619812, + "eval_runtime": 82.7779, + "eval_samples_per_second": 60.403, + "eval_steps_per_second": 15.101, + "num_input_tokens_seen": 71815816608, + "step": 137000 + }, + { + "epoch": 1.307466281884637, + "grad_norm": 0.11646866798400879, + "learning_rate": 2.7139335546282283e-05, + "loss": 2.0325, + "num_input_tokens_seen": 71842030368, + "step": 137050 + }, + { + "epoch": 1.3079432844008252, + "grad_norm": 0.10989837348461151, + "learning_rate": 2.6235218056235634e-05, + "loss": 2.0325, + "num_input_tokens_seen": 71868244768, + "step": 137100 + }, + { + "epoch": 1.3084202869170136, + "grad_norm": 0.11658209562301636, + "learning_rate": 2.5346010829944367e-05, + "loss": 2.0289, + "num_input_tokens_seen": 71894452160, + "step": 137150 + }, + { + "epoch": 1.3088972894332018, + "grad_norm": 0.11487242579460144, + "learning_rate": 2.4471741852423235e-05, + "loss": 2.0322, + "num_input_tokens_seen": 71920664928, + "step": 137200 + }, + { + "epoch": 1.30937429194939, + "grad_norm": 0.11544458568096161, + "learning_rate": 2.3612438638551835e-05, + "loss": 2.0279, + "num_input_tokens_seen": 71946876896, + "step": 137250 + }, + { + "epoch": 1.3098512944655782, + "grad_norm": 0.11500503867864609, + "learning_rate": 2.276812823220964e-05, + "loss": 2.0399, + "num_input_tokens_seen": 71973091200, + "step": 137300 + }, + { + "epoch": 1.3103282969817664, + "grad_norm": 0.11575910449028015, + "learning_rate": 2.1938837205424e-05, + "loss": 2.0246, + "num_input_tokens_seen": 71999300832, + "step": 137350 + }, + { + "epoch": 1.3108052994979549, + "grad_norm": 0.1175985336303711, + "learning_rate": 2.1124591657534777e-05, + "loss": 2.0225, + "num_input_tokens_seen": 72025515232, + "step": 137400 + }, + { + "epoch": 1.311282302014143, + "grad_norm": 0.11688115447759628, + "learning_rate": 2.032541721437209e-05, + "loss": 2.024, + "num_input_tokens_seen": 72051723040, + "step": 137450 + }, + { + "epoch": 1.3117593045303315, + "grad_norm": 0.11419174075126648, + "learning_rate": 1.9541339027450256e-05, + "loss": 2.0254, + "num_input_tokens_seen": 72077935168, + "step": 137500 + }, + { + "epoch": 1.3117593045303315, + "eval_loss": 1.951472282409668, + "eval_runtime": 83.1149, + "eval_samples_per_second": 60.158, + "eval_steps_per_second": 15.039, + "num_input_tokens_seen": 72077935168, + "step": 137500 + }, + { + "epoch": 1.3122363070465197, + "grad_norm": 0.11731937527656555, + "learning_rate": 1.8772381773176416e-05, + "loss": 2.0368, + "num_input_tokens_seen": 72104145664, + "step": 137550 + }, + { + "epoch": 1.312713309562708, + "grad_norm": 0.11281976848840714, + "learning_rate": 1.801856965207338e-05, + "loss": 2.0243, + "num_input_tokens_seen": 72130351488, + "step": 137600 + }, + { + "epoch": 1.3131903120788961, + "grad_norm": 0.12566816806793213, + "learning_rate": 1.7279926388018564e-05, + "loss": 2.0266, + "num_input_tokens_seen": 72156564000, + "step": 137650 + }, + { + "epoch": 1.3136673145950846, + "grad_norm": 0.1202327162027359, + "learning_rate": 1.6556475227496815e-05, + "loss": 2.0344, + "num_input_tokens_seen": 72182768800, + "step": 137700 + }, + { + "epoch": 1.3141443171112728, + "grad_norm": 0.11209400743246078, + "learning_rate": 1.584823893886933e-05, + "loss": 2.0307, + "num_input_tokens_seen": 72208977472, + "step": 137750 + }, + { + "epoch": 1.314621319627461, + "grad_norm": 0.11281031370162964, + "learning_rate": 1.5155239811656562e-05, + "loss": 2.0285, + "num_input_tokens_seen": 72235186752, + "step": 137800 + }, + { + "epoch": 1.3150983221436494, + "grad_norm": 0.11977609992027283, + "learning_rate": 1.4477499655837278e-05, + "loss": 2.0307, + "num_input_tokens_seen": 72261390432, + "step": 137850 + }, + { + "epoch": 1.3155753246598376, + "grad_norm": 0.11602313071489334, + "learning_rate": 1.3815039801161721e-05, + "loss": 2.0272, + "num_input_tokens_seen": 72287596960, + "step": 137900 + }, + { + "epoch": 1.3160523271760258, + "grad_norm": 0.11629103124141693, + "learning_rate": 1.3167881096480372e-05, + "loss": 2.0426, + "num_input_tokens_seen": 72313806912, + "step": 137950 + }, + { + "epoch": 1.316529329692214, + "grad_norm": 0.11337430030107498, + "learning_rate": 1.2536043909088191e-05, + "loss": 2.0286, + "num_input_tokens_seen": 72340003200, + "step": 138000 + }, + { + "epoch": 1.316529329692214, + "eval_loss": 1.9512444734573364, + "eval_runtime": 82.1325, + "eval_samples_per_second": 60.877, + "eval_steps_per_second": 15.219, + "num_input_tokens_seen": 72340003200, + "step": 138000 + }, + { + "epoch": 1.3170063322084025, + "grad_norm": 0.11734651029109955, + "learning_rate": 1.191954812408308e-05, + "loss": 2.0241, + "num_input_tokens_seen": 72366217600, + "step": 138050 + }, + { + "epoch": 1.3174833347245907, + "grad_norm": 0.11315104365348816, + "learning_rate": 1.1318413143740436e-05, + "loss": 2.0195, + "num_input_tokens_seen": 72392425632, + "step": 138100 + }, + { + "epoch": 1.3179603372407789, + "grad_norm": 0.11212780326604843, + "learning_rate": 1.0732657886902309e-05, + "loss": 2.0379, + "num_input_tokens_seen": 72418637536, + "step": 138150 + }, + { + "epoch": 1.3184373397569673, + "grad_norm": 0.11390957236289978, + "learning_rate": 1.0162300788382261e-05, + "loss": 2.0245, + "num_input_tokens_seen": 72444850752, + "step": 138200 + }, + { + "epoch": 1.3189143422731555, + "grad_norm": 0.11521212011575699, + "learning_rate": 9.607359798384786e-06, + "loss": 2.0313, + "num_input_tokens_seen": 72471060032, + "step": 138250 + }, + { + "epoch": 1.3193913447893437, + "grad_norm": 0.11375854164361954, + "learning_rate": 9.0678523819408e-06, + "loss": 2.0313, + "num_input_tokens_seen": 72497274432, + "step": 138300 + }, + { + "epoch": 1.319868347305532, + "grad_norm": 0.11399056017398834, + "learning_rate": 8.543795518357766e-06, + "loss": 2.0256, + "num_input_tokens_seen": 72523485952, + "step": 138350 + }, + { + "epoch": 1.3203453498217204, + "grad_norm": 0.11128194630146027, + "learning_rate": 8.035205700685167e-06, + "loss": 2.0338, + "num_input_tokens_seen": 72549700352, + "step": 138400 + }, + { + "epoch": 1.3208223523379086, + "grad_norm": 0.11179857701063156, + "learning_rate": 7.542098935195918e-06, + "loss": 2.0362, + "num_input_tokens_seen": 72575912992, + "step": 138450 + }, + { + "epoch": 1.3212993548540968, + "grad_norm": 0.11500924825668335, + "learning_rate": 7.064490740882057e-06, + "loss": 2.0285, + "num_input_tokens_seen": 72602127392, + "step": 138500 + }, + { + "epoch": 1.3212993548540968, + "eval_loss": 1.951123833656311, + "eval_runtime": 82.6672, + "eval_samples_per_second": 60.484, + "eval_steps_per_second": 15.121, + "num_input_tokens_seen": 72602127392, + "step": 138500 + }, + { + "epoch": 1.3217763573702852, + "grad_norm": 0.1176285520195961, + "learning_rate": 6.602396148966794e-06, + "loss": 2.0295, + "num_input_tokens_seen": 72628340704, + "step": 138550 + }, + { + "epoch": 1.3222533598864734, + "grad_norm": 0.11359469592571259, + "learning_rate": 6.15582970243117e-06, + "loss": 2.0206, + "num_input_tokens_seen": 72654548704, + "step": 138600 + }, + { + "epoch": 1.3227303624026616, + "grad_norm": 0.11230379343032837, + "learning_rate": 5.72480545555637e-06, + "loss": 2.0285, + "num_input_tokens_seen": 72680760704, + "step": 138650 + }, + { + "epoch": 1.3232073649188498, + "grad_norm": 0.11325126886367798, + "learning_rate": 5.309336973481682e-06, + "loss": 2.0316, + "num_input_tokens_seen": 72706975104, + "step": 138700 + }, + { + "epoch": 1.3236843674350383, + "grad_norm": 0.11530512571334839, + "learning_rate": 4.909437331777178e-06, + "loss": 2.0295, + "num_input_tokens_seen": 72733189504, + "step": 138750 + }, + { + "epoch": 1.3241613699512265, + "grad_norm": 0.11637042462825775, + "learning_rate": 4.52511911603265e-06, + "loss": 2.0358, + "num_input_tokens_seen": 72759403904, + "step": 138800 + }, + { + "epoch": 1.324638372467415, + "grad_norm": 0.11307495832443237, + "learning_rate": 4.15639442146093e-06, + "loss": 2.0256, + "num_input_tokens_seen": 72785609280, + "step": 138850 + }, + { + "epoch": 1.325115374983603, + "grad_norm": 0.11408944427967072, + "learning_rate": 3.803274852517968e-06, + "loss": 2.0432, + "num_input_tokens_seen": 72811823680, + "step": 138900 + }, + { + "epoch": 1.3255923774997913, + "grad_norm": 0.11304306238889694, + "learning_rate": 3.4657715225368535e-06, + "loss": 2.0342, + "num_input_tokens_seen": 72838035008, + "step": 138950 + }, + { + "epoch": 1.3260693800159795, + "grad_norm": 0.11682960391044617, + "learning_rate": 3.143895053378698e-06, + "loss": 2.0353, + "num_input_tokens_seen": 72864248896, + "step": 139000 + }, + { + "epoch": 1.3260693800159795, + "eval_loss": 1.9510550498962402, + "eval_runtime": 82.5623, + "eval_samples_per_second": 60.56, + "eval_steps_per_second": 15.14, + "num_input_tokens_seen": 72864248896, + "step": 139000 + }, + { + "epoch": 1.3265463825321677, + "grad_norm": 0.11243559420108795, + "learning_rate": 2.837655575097964e-06, + "loss": 2.0318, + "num_input_tokens_seen": 72890458688, + "step": 139050 + }, + { + "epoch": 1.3270233850483562, + "grad_norm": 0.11617834120988846, + "learning_rate": 2.547062725623828e-06, + "loss": 2.0384, + "num_input_tokens_seen": 72916673088, + "step": 139100 + }, + { + "epoch": 1.3275003875645444, + "grad_norm": 0.11737903952598572, + "learning_rate": 2.2721256504567023e-06, + "loss": 2.0235, + "num_input_tokens_seen": 72942884768, + "step": 139150 + }, + { + "epoch": 1.3279773900807328, + "grad_norm": 0.10866422206163406, + "learning_rate": 2.012853002380466e-06, + "loss": 2.024, + "num_input_tokens_seen": 72969088544, + "step": 139200 + }, + { + "epoch": 1.328454392596921, + "grad_norm": 0.11547800898551941, + "learning_rate": 1.769252941190458e-06, + "loss": 2.0323, + "num_input_tokens_seen": 72995301472, + "step": 139250 + }, + { + "epoch": 1.3289313951131092, + "grad_norm": 0.11617856472730637, + "learning_rate": 1.541333133436018e-06, + "loss": 2.0294, + "num_input_tokens_seen": 73021507392, + "step": 139300 + }, + { + "epoch": 1.3294083976292974, + "grad_norm": 0.11435816437005997, + "learning_rate": 1.3291007521799014e-06, + "loss": 2.0288, + "num_input_tokens_seen": 73047719968, + "step": 139350 + }, + { + "epoch": 1.3298854001454858, + "grad_norm": 0.11262206733226776, + "learning_rate": 1.132562476771959e-06, + "loss": 2.0301, + "num_input_tokens_seen": 73073924576, + "step": 139400 + }, + { + "epoch": 1.330362402661674, + "grad_norm": 0.11383078992366791, + "learning_rate": 9.517244926393609e-07, + "loss": 2.0187, + "num_input_tokens_seen": 73100138976, + "step": 139450 + }, + { + "epoch": 1.3308394051778623, + "grad_norm": 0.1159028634428978, + "learning_rate": 7.865924910916978e-07, + "loss": 2.0366, + "num_input_tokens_seen": 73126349984, + "step": 139500 + }, + { + "epoch": 1.3308394051778623, + "eval_loss": 1.9510103464126587, + "eval_runtime": 82.8489, + "eval_samples_per_second": 60.351, + "eval_steps_per_second": 15.088, + "num_input_tokens_seen": 73126349984, + "step": 139500 + }, + { + "epoch": 1.3313164076940507, + "grad_norm": 0.1160767450928688, + "learning_rate": 6.371716691419005e-07, + "loss": 2.0374, + "num_input_tokens_seen": 73152559296, + "step": 139550 + }, + { + "epoch": 1.331793410210239, + "grad_norm": 0.11154640465974808, + "learning_rate": 5.034667293427053e-07, + "loss": 2.0385, + "num_input_tokens_seen": 73178773696, + "step": 139600 + }, + { + "epoch": 1.332270412726427, + "grad_norm": 0.11127237975597382, + "learning_rate": 3.854818796385495e-07, + "loss": 2.0281, + "num_input_tokens_seen": 73204985664, + "step": 139650 + }, + { + "epoch": 1.3327474152426153, + "grad_norm": 0.11270651966333389, + "learning_rate": 2.8322083323334415e-07, + "loss": 2.022, + "num_input_tokens_seen": 73231192992, + "step": 139700 + }, + { + "epoch": 1.3332244177588037, + "grad_norm": 0.11388963460922241, + "learning_rate": 1.9668680847356734e-07, + "loss": 2.0305, + "num_input_tokens_seen": 73257397792, + "step": 139750 + }, + { + "epoch": 1.333701420274992, + "grad_norm": 0.11808367073535919, + "learning_rate": 1.2588252874673466e-07, + "loss": 2.0302, + "num_input_tokens_seen": 73283607648, + "step": 139800 + }, + { + "epoch": 1.3341784227911802, + "grad_norm": 0.11369805783033371, + "learning_rate": 7.081022239591173e-08, + "loss": 2.0355, + "num_input_tokens_seen": 73309822048, + "step": 139850 + }, + { + "epoch": 1.3346554253073686, + "grad_norm": 0.11115424335002899, + "learning_rate": 3.147162264971471e-08, + "loss": 2.027, + "num_input_tokens_seen": 73336032384, + "step": 139900 + }, + { + "epoch": 1.3351324278235568, + "grad_norm": 0.11730392277240753, + "learning_rate": 7.867967567354306e-09, + "loss": 2.0268, + "num_input_tokens_seen": 73362242112, + "step": 139950 + }, + { + "epoch": 1.335609430339745, + "grad_norm": 0.11209023743867874, + "learning_rate": 0.0, + "loss": 2.0315, + "num_input_tokens_seen": 73388446624, + "step": 140000 + }, + { + "epoch": 1.335609430339745, + "eval_loss": 1.9509990215301514, + "eval_runtime": 82.6099, + "eval_samples_per_second": 60.525, + "eval_steps_per_second": 15.131, + "num_input_tokens_seen": 73388446624, + "step": 140000 + }, + { + "epoch": 1.335609430339745, + "num_input_tokens_seen": 73388446624, + "step": 140000, + "total_flos": 1.2988416447181578e+20, + "train_loss": 1.353257349559239, + "train_runtime": 830736.6977, + "train_samples_per_second": 86.285, + "train_steps_per_second": 0.169, + "train_tokens_per_second": 88355.697 + } + ], + "logging_steps": 50, + "max_steps": 140000, + "num_input_tokens_seen": 73388446624, + "num_train_epochs": 2, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.2988416447181578e+20, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +}