{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.492654756029168, "eval_steps": 500, "global_step": 159000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00047700251618827287, "grad_norm": 0.7297705411911011, "learning_rate": 0.0001, "loss": 9.0134, "num_input_tokens_seen": 26214400, "step": 50 }, { "epoch": 0.0009540050323765457, "grad_norm": 0.9636281132698059, "learning_rate": 0.0002, "loss": 7.1965, "num_input_tokens_seen": 52419296, "step": 100 }, { "epoch": 0.0014310075485648188, "grad_norm": 0.538168728351593, "learning_rate": 0.0003, "loss": 6.2716, "num_input_tokens_seen": 78626240, "step": 150 }, { "epoch": 0.0019080100647530915, "grad_norm": 0.9969781041145325, "learning_rate": 0.0004, "loss": 5.6452, "num_input_tokens_seen": 104833632, "step": 200 }, { "epoch": 0.0023850125809413646, "grad_norm": 0.8100546002388, "learning_rate": 0.0005, "loss": 5.2224, "num_input_tokens_seen": 131047584, "step": 250 }, { "epoch": 0.0028620150971296375, "grad_norm": 0.4632050693035126, "learning_rate": 0.0006, "loss": 4.8733, "num_input_tokens_seen": 157260544, "step": 300 }, { "epoch": 0.0033390176133179105, "grad_norm": 2.2721216678619385, "learning_rate": 0.0007, "loss": 4.5928, "num_input_tokens_seen": 183472896, "step": 350 }, { "epoch": 0.003816020129506183, "grad_norm": 0.244660422205925, "learning_rate": 0.0008, "loss": 4.4136, "num_input_tokens_seen": 209685664, "step": 400 }, { "epoch": 0.004293022645694456, "grad_norm": 0.3138381540775299, "learning_rate": 0.0009000000000000001, "loss": 4.0552, "num_input_tokens_seen": 235896448, "step": 450 }, { "epoch": 0.004770025161882729, "grad_norm": 0.27429279685020447, "learning_rate": 0.001, "loss": 3.8584, "num_input_tokens_seen": 262104320, "step": 500 }, { "epoch": 0.004770025161882729, "eval_loss": 3.676422357559204, "eval_runtime": 80.6607, "eval_samples_per_second": 61.988, "eval_steps_per_second": 15.497, "num_input_tokens_seen": 262104320, "step": 500 }, { "epoch": 0.005247027678071002, "grad_norm": 0.20896735787391663, "learning_rate": 0.001, "loss": 3.6992, "num_input_tokens_seen": 288318720, "step": 550 }, { "epoch": 0.005724030194259275, "grad_norm": 0.1993994414806366, "learning_rate": 0.001, "loss": 3.5839, "num_input_tokens_seen": 314519904, "step": 600 }, { "epoch": 0.0062010327104475476, "grad_norm": 0.21841953694820404, "learning_rate": 0.001, "loss": 3.472, "num_input_tokens_seen": 340725984, "step": 650 }, { "epoch": 0.006678035226635821, "grad_norm": 0.17900177836418152, "learning_rate": 0.001, "loss": 3.3972, "num_input_tokens_seen": 366937216, "step": 700 }, { "epoch": 0.007155037742824093, "grad_norm": 0.1761142611503601, "learning_rate": 0.001, "loss": 3.3383, "num_input_tokens_seen": 393150592, "step": 750 }, { "epoch": 0.007632040259012366, "grad_norm": 0.19628353416919708, "learning_rate": 0.001, "loss": 3.2933, "num_input_tokens_seen": 419364992, "step": 800 }, { "epoch": 0.008109042775200638, "grad_norm": 0.18105614185333252, "learning_rate": 0.001, "loss": 3.2451, "num_input_tokens_seen": 445578432, "step": 850 }, { "epoch": 0.008586045291388912, "grad_norm": 0.15662750601768494, "learning_rate": 0.001, "loss": 3.2089, "num_input_tokens_seen": 471790144, "step": 900 }, { "epoch": 0.009063047807577185, "grad_norm": 0.16136744618415833, "learning_rate": 0.001, "loss": 3.1683, "num_input_tokens_seen": 498004544, "step": 950 }, { "epoch": 0.009540050323765458, "grad_norm": 0.16362419724464417, "learning_rate": 0.001, "loss": 3.1457, "num_input_tokens_seen": 524218944, "step": 1000 }, { "epoch": 0.009540050323765458, "eval_loss": 3.0411295890808105, "eval_runtime": 80.4796, "eval_samples_per_second": 62.128, "eval_steps_per_second": 15.532, "num_input_tokens_seen": 524218944, "step": 1000 }, { "epoch": 0.01001705283995373, "grad_norm": 0.15351758897304535, "learning_rate": 0.001, "loss": 3.1183, "num_input_tokens_seen": 550433344, "step": 1050 }, { "epoch": 0.010494055356142003, "grad_norm": 0.1585114449262619, "learning_rate": 0.001, "loss": 3.085, "num_input_tokens_seen": 576647744, "step": 1100 }, { "epoch": 0.010971057872330277, "grad_norm": 0.14503733813762665, "learning_rate": 0.001, "loss": 3.0629, "num_input_tokens_seen": 602861312, "step": 1150 }, { "epoch": 0.01144806038851855, "grad_norm": 0.14188329875469208, "learning_rate": 0.001, "loss": 3.0349, "num_input_tokens_seen": 629073184, "step": 1200 }, { "epoch": 0.011925062904706822, "grad_norm": 0.14688891172409058, "learning_rate": 0.001, "loss": 3.0177, "num_input_tokens_seen": 655280128, "step": 1250 }, { "epoch": 0.012402065420895095, "grad_norm": 0.1432763636112213, "learning_rate": 0.001, "loss": 3.0108, "num_input_tokens_seen": 681494048, "step": 1300 }, { "epoch": 0.012879067937083368, "grad_norm": 0.13625910878181458, "learning_rate": 0.001, "loss": 2.9809, "num_input_tokens_seen": 707703840, "step": 1350 }, { "epoch": 0.013356070453271642, "grad_norm": 0.13593867421150208, "learning_rate": 0.001, "loss": 2.9671, "num_input_tokens_seen": 733906080, "step": 1400 }, { "epoch": 0.013833072969459913, "grad_norm": 0.1373436599969864, "learning_rate": 0.001, "loss": 2.9487, "num_input_tokens_seen": 760116640, "step": 1450 }, { "epoch": 0.014310075485648187, "grad_norm": 0.13173167407512665, "learning_rate": 0.001, "loss": 2.9472, "num_input_tokens_seen": 786331040, "step": 1500 }, { "epoch": 0.014310075485648187, "eval_loss": 2.8491108417510986, "eval_runtime": 81.5351, "eval_samples_per_second": 61.323, "eval_steps_per_second": 15.331, "num_input_tokens_seen": 786331040, "step": 1500 }, { "epoch": 0.01478707800183646, "grad_norm": 0.12925633788108826, "learning_rate": 0.001, "loss": 2.9319, "num_input_tokens_seen": 812544672, "step": 1550 }, { "epoch": 0.015264080518024732, "grad_norm": 0.1303853690624237, "learning_rate": 0.001, "loss": 2.9005, "num_input_tokens_seen": 838745248, "step": 1600 }, { "epoch": 0.015741083034213007, "grad_norm": 0.13408446311950684, "learning_rate": 0.001, "loss": 2.9056, "num_input_tokens_seen": 864955200, "step": 1650 }, { "epoch": 0.016218085550401277, "grad_norm": 0.12651245296001434, "learning_rate": 0.001, "loss": 2.8906, "num_input_tokens_seen": 891169600, "step": 1700 }, { "epoch": 0.01669508806658955, "grad_norm": 0.12796996533870697, "learning_rate": 0.001, "loss": 2.867, "num_input_tokens_seen": 917366784, "step": 1750 }, { "epoch": 0.017172090582777823, "grad_norm": 0.12910747528076172, "learning_rate": 0.001, "loss": 2.8778, "num_input_tokens_seen": 943571520, "step": 1800 }, { "epoch": 0.017649093098966097, "grad_norm": 0.1252501755952835, "learning_rate": 0.001, "loss": 2.862, "num_input_tokens_seen": 969782240, "step": 1850 }, { "epoch": 0.01812609561515437, "grad_norm": 0.12633615732192993, "learning_rate": 0.001, "loss": 2.8504, "num_input_tokens_seen": 995995232, "step": 1900 }, { "epoch": 0.018603098131342644, "grad_norm": 0.13014598190784454, "learning_rate": 0.001, "loss": 2.8469, "num_input_tokens_seen": 1022209632, "step": 1950 }, { "epoch": 0.019080100647530917, "grad_norm": 0.12546297907829285, "learning_rate": 0.001, "loss": 2.8357, "num_input_tokens_seen": 1048424032, "step": 2000 }, { "epoch": 0.019080100647530917, "eval_loss": 2.7469162940979004, "eval_runtime": 81.156, "eval_samples_per_second": 61.61, "eval_steps_per_second": 15.402, "num_input_tokens_seen": 1048424032, "step": 2000 }, { "epoch": 0.01955710316371919, "grad_norm": 0.12093319743871689, "learning_rate": 0.001, "loss": 2.8284, "num_input_tokens_seen": 1074632192, "step": 2050 }, { "epoch": 0.02003410567990746, "grad_norm": 0.12063843011856079, "learning_rate": 0.001, "loss": 2.8207, "num_input_tokens_seen": 1100838272, "step": 2100 }, { "epoch": 0.020511108196095734, "grad_norm": 0.12201642990112305, "learning_rate": 0.001, "loss": 2.82, "num_input_tokens_seen": 1127052672, "step": 2150 }, { "epoch": 0.020988110712284007, "grad_norm": 0.1250978708267212, "learning_rate": 0.001, "loss": 2.7882, "num_input_tokens_seen": 1153256992, "step": 2200 }, { "epoch": 0.02146511322847228, "grad_norm": 0.12314685434103012, "learning_rate": 0.001, "loss": 2.7836, "num_input_tokens_seen": 1179454080, "step": 2250 }, { "epoch": 0.021942115744660554, "grad_norm": 0.12332645803689957, "learning_rate": 0.001, "loss": 2.7729, "num_input_tokens_seen": 1205657440, "step": 2300 }, { "epoch": 0.022419118260848827, "grad_norm": 0.12105974555015564, "learning_rate": 0.001, "loss": 2.7852, "num_input_tokens_seen": 1231864768, "step": 2350 }, { "epoch": 0.0228961207770371, "grad_norm": 0.1196957528591156, "learning_rate": 0.001, "loss": 2.7846, "num_input_tokens_seen": 1258076896, "step": 2400 }, { "epoch": 0.02337312329322537, "grad_norm": 0.11613446474075317, "learning_rate": 0.001, "loss": 2.7633, "num_input_tokens_seen": 1284287264, "step": 2450 }, { "epoch": 0.023850125809413644, "grad_norm": 0.11567731201648712, "learning_rate": 0.001, "loss": 2.7564, "num_input_tokens_seen": 1310500000, "step": 2500 }, { "epoch": 0.023850125809413644, "eval_loss": 2.682379722595215, "eval_runtime": 81.2078, "eval_samples_per_second": 61.57, "eval_steps_per_second": 15.393, "num_input_tokens_seen": 1310500000, "step": 2500 }, { "epoch": 0.024327128325601917, "grad_norm": 0.11191745847463608, "learning_rate": 0.001, "loss": 2.763, "num_input_tokens_seen": 1336707360, "step": 2550 }, { "epoch": 0.02480413084179019, "grad_norm": 0.12129329890012741, "learning_rate": 0.001, "loss": 2.7546, "num_input_tokens_seen": 1362918912, "step": 2600 }, { "epoch": 0.025281133357978464, "grad_norm": 0.11706431955099106, "learning_rate": 0.001, "loss": 2.7437, "num_input_tokens_seen": 1389127904, "step": 2650 }, { "epoch": 0.025758135874166737, "grad_norm": 0.11681405454874039, "learning_rate": 0.001, "loss": 2.7439, "num_input_tokens_seen": 1415333568, "step": 2700 }, { "epoch": 0.02623513839035501, "grad_norm": 0.11216867715120316, "learning_rate": 0.001, "loss": 2.7292, "num_input_tokens_seen": 1441547968, "step": 2750 }, { "epoch": 0.026712140906543284, "grad_norm": 0.10973285883665085, "learning_rate": 0.001, "loss": 2.7223, "num_input_tokens_seen": 1467757216, "step": 2800 }, { "epoch": 0.027189143422731554, "grad_norm": 0.11623947322368622, "learning_rate": 0.001, "loss": 2.729, "num_input_tokens_seen": 1493966976, "step": 2850 }, { "epoch": 0.027666145938919827, "grad_norm": 0.11490777134895325, "learning_rate": 0.001, "loss": 2.7275, "num_input_tokens_seen": 1520179488, "step": 2900 }, { "epoch": 0.0281431484551081, "grad_norm": 0.11005893349647522, "learning_rate": 0.001, "loss": 2.7151, "num_input_tokens_seen": 1546390432, "step": 2950 }, { "epoch": 0.028620150971296374, "grad_norm": 0.11037708073854446, "learning_rate": 0.001, "loss": 2.719, "num_input_tokens_seen": 1572599776, "step": 3000 }, { "epoch": 0.028620150971296374, "eval_loss": 2.629970073699951, "eval_runtime": 79.9671, "eval_samples_per_second": 62.526, "eval_steps_per_second": 15.631, "num_input_tokens_seen": 1572599776, "step": 3000 }, { "epoch": 0.029097153487484647, "grad_norm": 0.11220329999923706, "learning_rate": 0.001, "loss": 2.7135, "num_input_tokens_seen": 1598814176, "step": 3050 }, { "epoch": 0.02957415600367292, "grad_norm": 0.1040477603673935, "learning_rate": 0.001, "loss": 2.7088, "num_input_tokens_seen": 1625028576, "step": 3100 }, { "epoch": 0.030051158519861194, "grad_norm": 0.11202716827392578, "learning_rate": 0.001, "loss": 2.6951, "num_input_tokens_seen": 1651241664, "step": 3150 }, { "epoch": 0.030528161036049464, "grad_norm": 0.11337998509407043, "learning_rate": 0.001, "loss": 2.6895, "num_input_tokens_seen": 1677453952, "step": 3200 }, { "epoch": 0.031005163552237737, "grad_norm": 0.10991177707910538, "learning_rate": 0.001, "loss": 2.6833, "num_input_tokens_seen": 1703668064, "step": 3250 }, { "epoch": 0.031482166068426014, "grad_norm": 0.11068691313266754, "learning_rate": 0.001, "loss": 2.6861, "num_input_tokens_seen": 1729882464, "step": 3300 }, { "epoch": 0.031959168584614284, "grad_norm": 0.10345873981714249, "learning_rate": 0.001, "loss": 2.6793, "num_input_tokens_seen": 1756093888, "step": 3350 }, { "epoch": 0.032436171100802554, "grad_norm": 0.10937945544719696, "learning_rate": 0.001, "loss": 2.6793, "num_input_tokens_seen": 1782308288, "step": 3400 }, { "epoch": 0.03291317361699083, "grad_norm": 0.10656026005744934, "learning_rate": 0.001, "loss": 2.6773, "num_input_tokens_seen": 1808520352, "step": 3450 }, { "epoch": 0.0333901761331791, "grad_norm": 0.10830007493495941, "learning_rate": 0.001, "loss": 2.664, "num_input_tokens_seen": 1834732864, "step": 3500 }, { "epoch": 0.0333901761331791, "eval_loss": 2.590848684310913, "eval_runtime": 80.8416, "eval_samples_per_second": 61.849, "eval_steps_per_second": 15.462, "num_input_tokens_seen": 1834732864, "step": 3500 }, { "epoch": 0.03386717864936738, "grad_norm": 0.10620469599962234, "learning_rate": 0.001, "loss": 2.6712, "num_input_tokens_seen": 1860937952, "step": 3550 }, { "epoch": 0.03434418116555565, "grad_norm": 0.10916534811258316, "learning_rate": 0.001, "loss": 2.6589, "num_input_tokens_seen": 1887151168, "step": 3600 }, { "epoch": 0.034821183681743924, "grad_norm": 0.1078685000538826, "learning_rate": 0.001, "loss": 2.6528, "num_input_tokens_seen": 1913358112, "step": 3650 }, { "epoch": 0.035298186197932194, "grad_norm": 0.10825319588184357, "learning_rate": 0.001, "loss": 2.6542, "num_input_tokens_seen": 1939568448, "step": 3700 }, { "epoch": 0.035775188714120464, "grad_norm": 0.10253206640481949, "learning_rate": 0.001, "loss": 2.6568, "num_input_tokens_seen": 1965778912, "step": 3750 }, { "epoch": 0.03625219123030874, "grad_norm": 0.10500983893871307, "learning_rate": 0.001, "loss": 2.6474, "num_input_tokens_seen": 1991992192, "step": 3800 }, { "epoch": 0.03672919374649701, "grad_norm": 0.11013150215148926, "learning_rate": 0.001, "loss": 2.6529, "num_input_tokens_seen": 2018206592, "step": 3850 }, { "epoch": 0.03720619626268529, "grad_norm": 0.10353852063417435, "learning_rate": 0.001, "loss": 2.6254, "num_input_tokens_seen": 2044403936, "step": 3900 }, { "epoch": 0.03768319877887356, "grad_norm": 0.11490489542484283, "learning_rate": 0.001, "loss": 2.639, "num_input_tokens_seen": 2070618336, "step": 3950 }, { "epoch": 0.038160201295061834, "grad_norm": 0.10220393538475037, "learning_rate": 0.001, "loss": 2.6351, "num_input_tokens_seen": 2096832320, "step": 4000 }, { "epoch": 0.038160201295061834, "eval_loss": 2.5540711879730225, "eval_runtime": 80.3545, "eval_samples_per_second": 62.224, "eval_steps_per_second": 15.556, "num_input_tokens_seen": 2096832320, "step": 4000 }, { "epoch": 0.038637203811250104, "grad_norm": 0.10771480202674866, "learning_rate": 0.001, "loss": 2.6332, "num_input_tokens_seen": 2123043360, "step": 4050 }, { "epoch": 0.03911420632743838, "grad_norm": 0.10670652985572815, "learning_rate": 0.001, "loss": 2.6188, "num_input_tokens_seen": 2149257312, "step": 4100 }, { "epoch": 0.03959120884362665, "grad_norm": 0.10759977996349335, "learning_rate": 0.001, "loss": 2.6244, "num_input_tokens_seen": 2175471424, "step": 4150 }, { "epoch": 0.04006821135981492, "grad_norm": 0.10374791920185089, "learning_rate": 0.001, "loss": 2.6311, "num_input_tokens_seen": 2201682208, "step": 4200 }, { "epoch": 0.0405452138760032, "grad_norm": 0.11101187020540237, "learning_rate": 0.001, "loss": 2.6163, "num_input_tokens_seen": 2227895840, "step": 4250 }, { "epoch": 0.04102221639219147, "grad_norm": 0.11088625341653824, "learning_rate": 0.001, "loss": 2.6197, "num_input_tokens_seen": 2254107488, "step": 4300 }, { "epoch": 0.041499218908379744, "grad_norm": 0.10880734026432037, "learning_rate": 0.001, "loss": 2.6176, "num_input_tokens_seen": 2280321504, "step": 4350 }, { "epoch": 0.041976221424568014, "grad_norm": 0.10170961171388626, "learning_rate": 0.001, "loss": 2.6257, "num_input_tokens_seen": 2306535904, "step": 4400 }, { "epoch": 0.04245322394075629, "grad_norm": 0.11956244707107544, "learning_rate": 0.001, "loss": 2.5995, "num_input_tokens_seen": 2332750304, "step": 4450 }, { "epoch": 0.04293022645694456, "grad_norm": 0.11314979940652847, "learning_rate": 0.001, "loss": 2.5897, "num_input_tokens_seen": 2358960576, "step": 4500 }, { "epoch": 0.04293022645694456, "eval_loss": 2.518724203109741, "eval_runtime": 81.1978, "eval_samples_per_second": 61.578, "eval_steps_per_second": 15.395, "num_input_tokens_seen": 2358960576, "step": 4500 }, { "epoch": 0.04340722897313283, "grad_norm": 0.1124994158744812, "learning_rate": 0.001, "loss": 2.5951, "num_input_tokens_seen": 2385174976, "step": 4550 }, { "epoch": 0.04388423148932111, "grad_norm": 0.10833606123924255, "learning_rate": 0.001, "loss": 2.5963, "num_input_tokens_seen": 2411389376, "step": 4600 }, { "epoch": 0.04436123400550938, "grad_norm": 0.10818412154912949, "learning_rate": 0.001, "loss": 2.5792, "num_input_tokens_seen": 2437602528, "step": 4650 }, { "epoch": 0.044838236521697654, "grad_norm": 0.11943142861127853, "learning_rate": 0.001, "loss": 2.5782, "num_input_tokens_seen": 2463816672, "step": 4700 }, { "epoch": 0.045315239037885924, "grad_norm": 0.11240798234939575, "learning_rate": 0.001, "loss": 2.5745, "num_input_tokens_seen": 2490026592, "step": 4750 }, { "epoch": 0.0457922415540742, "grad_norm": 0.11156616359949112, "learning_rate": 0.001, "loss": 2.5825, "num_input_tokens_seen": 2516240992, "step": 4800 }, { "epoch": 0.04626924407026247, "grad_norm": 0.121095672249794, "learning_rate": 0.001, "loss": 2.5813, "num_input_tokens_seen": 2542455392, "step": 4850 }, { "epoch": 0.04674624658645074, "grad_norm": 0.11107343435287476, "learning_rate": 0.001, "loss": 2.5752, "num_input_tokens_seen": 2568666624, "step": 4900 }, { "epoch": 0.04722324910263902, "grad_norm": 0.10824497044086456, "learning_rate": 0.001, "loss": 2.5692, "num_input_tokens_seen": 2594875456, "step": 4950 }, { "epoch": 0.04770025161882729, "grad_norm": 0.11280784755945206, "learning_rate": 0.001, "loss": 2.5672, "num_input_tokens_seen": 2621089856, "step": 5000 }, { "epoch": 0.04770025161882729, "eval_loss": 2.490786552429199, "eval_runtime": 80.5954, "eval_samples_per_second": 62.038, "eval_steps_per_second": 15.51, "num_input_tokens_seen": 2621089856, "step": 5000 }, { "epoch": 0.048177254135015564, "grad_norm": 0.11431359499692917, "learning_rate": 0.001, "loss": 2.569, "num_input_tokens_seen": 2647300640, "step": 5050 }, { "epoch": 0.048654256651203834, "grad_norm": 0.12080084532499313, "learning_rate": 0.001, "loss": 2.5642, "num_input_tokens_seen": 2673510592, "step": 5100 }, { "epoch": 0.04913125916739211, "grad_norm": 0.11316218972206116, "learning_rate": 0.001, "loss": 2.5716, "num_input_tokens_seen": 2699713920, "step": 5150 }, { "epoch": 0.04960826168358038, "grad_norm": 0.1254076361656189, "learning_rate": 0.001, "loss": 2.5547, "num_input_tokens_seen": 2725913312, "step": 5200 }, { "epoch": 0.05008526419976865, "grad_norm": 0.11621085554361343, "learning_rate": 0.001, "loss": 2.5575, "num_input_tokens_seen": 2752123328, "step": 5250 }, { "epoch": 0.05056226671595693, "grad_norm": 0.1208173856139183, "learning_rate": 0.001, "loss": 2.5484, "num_input_tokens_seen": 2778334848, "step": 5300 }, { "epoch": 0.0510392692321452, "grad_norm": 0.11889180541038513, "learning_rate": 0.001, "loss": 2.5501, "num_input_tokens_seen": 2804545664, "step": 5350 }, { "epoch": 0.051516271748333474, "grad_norm": 0.11486896872520447, "learning_rate": 0.001, "loss": 2.5472, "num_input_tokens_seen": 2830748096, "step": 5400 }, { "epoch": 0.051993274264521744, "grad_norm": 0.11431973427534103, "learning_rate": 0.001, "loss": 2.5494, "num_input_tokens_seen": 2856962496, "step": 5450 }, { "epoch": 0.05247027678071002, "grad_norm": 0.11589290201663971, "learning_rate": 0.001, "loss": 2.5412, "num_input_tokens_seen": 2883166048, "step": 5500 }, { "epoch": 0.05247027678071002, "eval_loss": 2.4638915061950684, "eval_runtime": 80.0714, "eval_samples_per_second": 62.444, "eval_steps_per_second": 15.611, "num_input_tokens_seen": 2883166048, "step": 5500 }, { "epoch": 0.05294727929689829, "grad_norm": 0.11737602949142456, "learning_rate": 0.001, "loss": 2.5458, "num_input_tokens_seen": 2909379680, "step": 5550 }, { "epoch": 0.05342428181308657, "grad_norm": 0.11384102702140808, "learning_rate": 0.001, "loss": 2.5521, "num_input_tokens_seen": 2935594080, "step": 5600 }, { "epoch": 0.05390128432927484, "grad_norm": 0.12825793027877808, "learning_rate": 0.001, "loss": 2.5449, "num_input_tokens_seen": 2961804896, "step": 5650 }, { "epoch": 0.05437828684546311, "grad_norm": 0.11516230553388596, "learning_rate": 0.001, "loss": 2.5306, "num_input_tokens_seen": 2988008128, "step": 5700 }, { "epoch": 0.054855289361651384, "grad_norm": 0.11697406321763992, "learning_rate": 0.001, "loss": 2.5265, "num_input_tokens_seen": 3014213824, "step": 5750 }, { "epoch": 0.055332291877839654, "grad_norm": 0.1262071430683136, "learning_rate": 0.001, "loss": 2.5359, "num_input_tokens_seen": 3040422720, "step": 5800 }, { "epoch": 0.05580929439402793, "grad_norm": 0.11729097366333008, "learning_rate": 0.001, "loss": 2.5339, "num_input_tokens_seen": 3066632448, "step": 5850 }, { "epoch": 0.0562862969102162, "grad_norm": 0.12072544544935226, "learning_rate": 0.001, "loss": 2.5184, "num_input_tokens_seen": 3092846848, "step": 5900 }, { "epoch": 0.05676329942640448, "grad_norm": 0.12556667625904083, "learning_rate": 0.001, "loss": 2.5229, "num_input_tokens_seen": 3119043104, "step": 5950 }, { "epoch": 0.05724030194259275, "grad_norm": 0.13290442526340485, "learning_rate": 0.001, "loss": 2.5194, "num_input_tokens_seen": 3145255744, "step": 6000 }, { "epoch": 0.05724030194259275, "eval_loss": 2.442291736602783, "eval_runtime": 80.47, "eval_samples_per_second": 62.135, "eval_steps_per_second": 15.534, "num_input_tokens_seen": 3145255744, "step": 6000 }, { "epoch": 0.05771730445878102, "grad_norm": 0.1415167599916458, "learning_rate": 0.001, "loss": 2.5221, "num_input_tokens_seen": 3171470144, "step": 6050 }, { "epoch": 0.058194306974969294, "grad_norm": 0.11889927089214325, "learning_rate": 0.001, "loss": 2.5192, "num_input_tokens_seen": 3197676704, "step": 6100 }, { "epoch": 0.058671309491157564, "grad_norm": 0.12153992801904678, "learning_rate": 0.001, "loss": 2.5166, "num_input_tokens_seen": 3223884160, "step": 6150 }, { "epoch": 0.05914831200734584, "grad_norm": 0.11614126712083817, "learning_rate": 0.001, "loss": 2.5287, "num_input_tokens_seen": 3250096704, "step": 6200 }, { "epoch": 0.05962531452353411, "grad_norm": 0.1198962926864624, "learning_rate": 0.001, "loss": 2.5111, "num_input_tokens_seen": 3276311040, "step": 6250 }, { "epoch": 0.06010231703972239, "grad_norm": 0.13005641102790833, "learning_rate": 0.001, "loss": 2.509, "num_input_tokens_seen": 3302517568, "step": 6300 }, { "epoch": 0.06057931955591066, "grad_norm": 0.11713956296443939, "learning_rate": 0.001, "loss": 2.5089, "num_input_tokens_seen": 3328719296, "step": 6350 }, { "epoch": 0.06105632207209893, "grad_norm": 0.11161922663450241, "learning_rate": 0.001, "loss": 2.5082, "num_input_tokens_seen": 3354930368, "step": 6400 }, { "epoch": 0.061533324588287204, "grad_norm": 0.12296202778816223, "learning_rate": 0.001, "loss": 2.5102, "num_input_tokens_seen": 3381142240, "step": 6450 }, { "epoch": 0.062010327104475474, "grad_norm": 0.11225474625825882, "learning_rate": 0.001, "loss": 2.5105, "num_input_tokens_seen": 3407356352, "step": 6500 }, { "epoch": 0.062010327104475474, "eval_loss": 2.4205968379974365, "eval_runtime": 80.5402, "eval_samples_per_second": 62.081, "eval_steps_per_second": 15.52, "num_input_tokens_seen": 3407356352, "step": 6500 }, { "epoch": 0.06248732962066375, "grad_norm": 0.12702156603336334, "learning_rate": 0.001, "loss": 2.5075, "num_input_tokens_seen": 3433569248, "step": 6550 }, { "epoch": 0.06296433213685203, "grad_norm": 0.12885423004627228, "learning_rate": 0.001, "loss": 2.4859, "num_input_tokens_seen": 3459777248, "step": 6600 }, { "epoch": 0.06344133465304029, "grad_norm": 0.13935446739196777, "learning_rate": 0.001, "loss": 2.5114, "num_input_tokens_seen": 3485989376, "step": 6650 }, { "epoch": 0.06391833716922857, "grad_norm": 0.12149051576852798, "learning_rate": 0.001, "loss": 2.4924, "num_input_tokens_seen": 3512203776, "step": 6700 }, { "epoch": 0.06439533968541684, "grad_norm": 0.12380675226449966, "learning_rate": 0.001, "loss": 2.4963, "num_input_tokens_seen": 3538413504, "step": 6750 }, { "epoch": 0.06487234220160511, "grad_norm": 0.12020547688007355, "learning_rate": 0.001, "loss": 2.4924, "num_input_tokens_seen": 3564626560, "step": 6800 }, { "epoch": 0.06534934471779338, "grad_norm": 0.12433449178934097, "learning_rate": 0.001, "loss": 2.4835, "num_input_tokens_seen": 3590833408, "step": 6850 }, { "epoch": 0.06582634723398166, "grad_norm": 0.11073850840330124, "learning_rate": 0.001, "loss": 2.49, "num_input_tokens_seen": 3617045664, "step": 6900 }, { "epoch": 0.06630334975016994, "grad_norm": 0.12657274305820465, "learning_rate": 0.001, "loss": 2.4922, "num_input_tokens_seen": 3643256224, "step": 6950 }, { "epoch": 0.0667803522663582, "grad_norm": 0.13630461692810059, "learning_rate": 0.001, "loss": 2.4816, "num_input_tokens_seen": 3669468320, "step": 7000 }, { "epoch": 0.0667803522663582, "eval_loss": 2.4051008224487305, "eval_runtime": 80.911, "eval_samples_per_second": 61.796, "eval_steps_per_second": 15.449, "num_input_tokens_seen": 3669468320, "step": 7000 }, { "epoch": 0.06725735478254648, "grad_norm": 0.11266546696424484, "learning_rate": 0.001, "loss": 2.4822, "num_input_tokens_seen": 3695674400, "step": 7050 }, { "epoch": 0.06773435729873475, "grad_norm": 0.13039050996303558, "learning_rate": 0.001, "loss": 2.4885, "num_input_tokens_seen": 3721882592, "step": 7100 }, { "epoch": 0.06821135981492302, "grad_norm": 0.11898328363895416, "learning_rate": 0.001, "loss": 2.4756, "num_input_tokens_seen": 3748091968, "step": 7150 }, { "epoch": 0.0686883623311113, "grad_norm": 0.11951896548271179, "learning_rate": 0.001, "loss": 2.4719, "num_input_tokens_seen": 3774297216, "step": 7200 }, { "epoch": 0.06916536484729957, "grad_norm": 0.13969680666923523, "learning_rate": 0.001, "loss": 2.4706, "num_input_tokens_seen": 3800509600, "step": 7250 }, { "epoch": 0.06964236736348785, "grad_norm": 0.12787151336669922, "learning_rate": 0.001, "loss": 2.4738, "num_input_tokens_seen": 3826723648, "step": 7300 }, { "epoch": 0.07011936987967611, "grad_norm": 0.13117018342018127, "learning_rate": 0.001, "loss": 2.4735, "num_input_tokens_seen": 3852920288, "step": 7350 }, { "epoch": 0.07059637239586439, "grad_norm": 0.11509765684604645, "learning_rate": 0.001, "loss": 2.4648, "num_input_tokens_seen": 3879127072, "step": 7400 }, { "epoch": 0.07107337491205266, "grad_norm": 0.1272098869085312, "learning_rate": 0.001, "loss": 2.4806, "num_input_tokens_seen": 3905334720, "step": 7450 }, { "epoch": 0.07155037742824093, "grad_norm": 0.1205294206738472, "learning_rate": 0.001, "loss": 2.4688, "num_input_tokens_seen": 3931543616, "step": 7500 }, { "epoch": 0.07155037742824093, "eval_loss": 2.3871288299560547, "eval_runtime": 81.0356, "eval_samples_per_second": 61.701, "eval_steps_per_second": 15.425, "num_input_tokens_seen": 3931543616, "step": 7500 }, { "epoch": 0.0720273799444292, "grad_norm": 0.13648000359535217, "learning_rate": 0.001, "loss": 2.4692, "num_input_tokens_seen": 3957757216, "step": 7550 }, { "epoch": 0.07250438246061748, "grad_norm": 0.13873665034770966, "learning_rate": 0.001, "loss": 2.4642, "num_input_tokens_seen": 3983965696, "step": 7600 }, { "epoch": 0.07298138497680576, "grad_norm": 0.1256738156080246, "learning_rate": 0.001, "loss": 2.4706, "num_input_tokens_seen": 4010175648, "step": 7650 }, { "epoch": 0.07345838749299402, "grad_norm": 0.12166794389486313, "learning_rate": 0.001, "loss": 2.4663, "num_input_tokens_seen": 4036387936, "step": 7700 }, { "epoch": 0.0739353900091823, "grad_norm": 0.1347389817237854, "learning_rate": 0.001, "loss": 2.4678, "num_input_tokens_seen": 4062595136, "step": 7750 }, { "epoch": 0.07441239252537057, "grad_norm": 0.13025853037834167, "learning_rate": 0.001, "loss": 2.4668, "num_input_tokens_seen": 4088807456, "step": 7800 }, { "epoch": 0.07488939504155885, "grad_norm": 0.12036091089248657, "learning_rate": 0.001, "loss": 2.4665, "num_input_tokens_seen": 4115018176, "step": 7850 }, { "epoch": 0.07536639755774711, "grad_norm": 0.12124933302402496, "learning_rate": 0.001, "loss": 2.4564, "num_input_tokens_seen": 4141222464, "step": 7900 }, { "epoch": 0.07584340007393539, "grad_norm": 0.1202184334397316, "learning_rate": 0.001, "loss": 2.4657, "num_input_tokens_seen": 4167436512, "step": 7950 }, { "epoch": 0.07632040259012367, "grad_norm": 0.14438344538211823, "learning_rate": 0.001, "loss": 2.4446, "num_input_tokens_seen": 4193649312, "step": 8000 }, { "epoch": 0.07632040259012367, "eval_loss": 2.3720171451568604, "eval_runtime": 80.9787, "eval_samples_per_second": 61.745, "eval_steps_per_second": 15.436, "num_input_tokens_seen": 4193649312, "step": 8000 }, { "epoch": 0.07679740510631193, "grad_norm": 0.13343645632266998, "learning_rate": 0.001, "loss": 2.44, "num_input_tokens_seen": 4219850656, "step": 8050 }, { "epoch": 0.07727440762250021, "grad_norm": 0.13672953844070435, "learning_rate": 0.001, "loss": 2.4528, "num_input_tokens_seen": 4246062080, "step": 8100 }, { "epoch": 0.07775141013868848, "grad_norm": 0.12469538301229477, "learning_rate": 0.001, "loss": 2.4564, "num_input_tokens_seen": 4272276480, "step": 8150 }, { "epoch": 0.07822841265487676, "grad_norm": 0.1281704306602478, "learning_rate": 0.001, "loss": 2.4448, "num_input_tokens_seen": 4298480576, "step": 8200 }, { "epoch": 0.07870541517106502, "grad_norm": 0.12879879772663116, "learning_rate": 0.001, "loss": 2.4482, "num_input_tokens_seen": 4324682816, "step": 8250 }, { "epoch": 0.0791824176872533, "grad_norm": 0.11960000544786453, "learning_rate": 0.001, "loss": 2.4418, "num_input_tokens_seen": 4350897216, "step": 8300 }, { "epoch": 0.07965942020344158, "grad_norm": 0.13047458231449127, "learning_rate": 0.001, "loss": 2.4595, "num_input_tokens_seen": 4377111616, "step": 8350 }, { "epoch": 0.08013642271962984, "grad_norm": 0.12718771398067474, "learning_rate": 0.001, "loss": 2.4419, "num_input_tokens_seen": 4403326016, "step": 8400 }, { "epoch": 0.08061342523581812, "grad_norm": 0.14239729940891266, "learning_rate": 0.001, "loss": 2.4444, "num_input_tokens_seen": 4429534304, "step": 8450 }, { "epoch": 0.0810904277520064, "grad_norm": 0.12223052978515625, "learning_rate": 0.001, "loss": 2.4318, "num_input_tokens_seen": 4455747616, "step": 8500 }, { "epoch": 0.0810904277520064, "eval_loss": 2.3614137172698975, "eval_runtime": 80.6431, "eval_samples_per_second": 62.002, "eval_steps_per_second": 15.5, "num_input_tokens_seen": 4455747616, "step": 8500 }, { "epoch": 0.08156743026819467, "grad_norm": 0.1250275820493698, "learning_rate": 0.001, "loss": 2.4465, "num_input_tokens_seen": 4481959552, "step": 8550 }, { "epoch": 0.08204443278438293, "grad_norm": 0.13238155841827393, "learning_rate": 0.001, "loss": 2.4396, "num_input_tokens_seen": 4508167424, "step": 8600 }, { "epoch": 0.08252143530057121, "grad_norm": 0.12801779806613922, "learning_rate": 0.001, "loss": 2.444, "num_input_tokens_seen": 4534381504, "step": 8650 }, { "epoch": 0.08299843781675949, "grad_norm": 0.12822921574115753, "learning_rate": 0.001, "loss": 2.4388, "num_input_tokens_seen": 4560591488, "step": 8700 }, { "epoch": 0.08347544033294775, "grad_norm": 0.131358340382576, "learning_rate": 0.001, "loss": 2.4305, "num_input_tokens_seen": 4586805888, "step": 8750 }, { "epoch": 0.08395244284913603, "grad_norm": 0.12687794864177704, "learning_rate": 0.001, "loss": 2.4341, "num_input_tokens_seen": 4613017088, "step": 8800 }, { "epoch": 0.0844294453653243, "grad_norm": 0.12758538126945496, "learning_rate": 0.001, "loss": 2.4328, "num_input_tokens_seen": 4639231200, "step": 8850 }, { "epoch": 0.08490644788151258, "grad_norm": 0.13047289848327637, "learning_rate": 0.001, "loss": 2.4381, "num_input_tokens_seen": 4665445600, "step": 8900 }, { "epoch": 0.08538345039770084, "grad_norm": 0.12238621711730957, "learning_rate": 0.001, "loss": 2.4278, "num_input_tokens_seen": 4691659872, "step": 8950 }, { "epoch": 0.08586045291388912, "grad_norm": 0.1371585875749588, "learning_rate": 0.001, "loss": 2.4292, "num_input_tokens_seen": 4717856864, "step": 9000 }, { "epoch": 0.08586045291388912, "eval_loss": 2.3485047817230225, "eval_runtime": 80.8327, "eval_samples_per_second": 61.856, "eval_steps_per_second": 15.464, "num_input_tokens_seen": 4717856864, "step": 9000 }, { "epoch": 0.0863374554300774, "grad_norm": 0.12939831614494324, "learning_rate": 0.001, "loss": 2.4345, "num_input_tokens_seen": 4744065888, "step": 9050 }, { "epoch": 0.08681445794626566, "grad_norm": 0.1290908306837082, "learning_rate": 0.001, "loss": 2.4216, "num_input_tokens_seen": 4770277888, "step": 9100 }, { "epoch": 0.08729146046245394, "grad_norm": 0.12267202883958817, "learning_rate": 0.001, "loss": 2.4195, "num_input_tokens_seen": 4796489056, "step": 9150 }, { "epoch": 0.08776846297864221, "grad_norm": 0.13644106686115265, "learning_rate": 0.001, "loss": 2.418, "num_input_tokens_seen": 4822694912, "step": 9200 }, { "epoch": 0.08824546549483049, "grad_norm": 0.12562055885791779, "learning_rate": 0.001, "loss": 2.4262, "num_input_tokens_seen": 4848909312, "step": 9250 }, { "epoch": 0.08872246801101875, "grad_norm": 0.12123631685972214, "learning_rate": 0.001, "loss": 2.4178, "num_input_tokens_seen": 4875119552, "step": 9300 }, { "epoch": 0.08919947052720703, "grad_norm": 0.12225483357906342, "learning_rate": 0.001, "loss": 2.4146, "num_input_tokens_seen": 4901319648, "step": 9350 }, { "epoch": 0.08967647304339531, "grad_norm": 0.1262338012456894, "learning_rate": 0.001, "loss": 2.411, "num_input_tokens_seen": 4927533024, "step": 9400 }, { "epoch": 0.09015347555958357, "grad_norm": 0.12114047259092331, "learning_rate": 0.001, "loss": 2.4253, "num_input_tokens_seen": 4953741376, "step": 9450 }, { "epoch": 0.09063047807577185, "grad_norm": 0.12057732045650482, "learning_rate": 0.001, "loss": 2.4151, "num_input_tokens_seen": 4979955776, "step": 9500 }, { "epoch": 0.09063047807577185, "eval_loss": 2.3371479511260986, "eval_runtime": 80.7643, "eval_samples_per_second": 61.909, "eval_steps_per_second": 15.477, "num_input_tokens_seen": 4979955776, "step": 9500 }, { "epoch": 0.09110748059196012, "grad_norm": 0.13011221587657928, "learning_rate": 0.001, "loss": 2.4187, "num_input_tokens_seen": 5006160832, "step": 9550 }, { "epoch": 0.0915844831081484, "grad_norm": 0.133403941988945, "learning_rate": 0.001, "loss": 2.414, "num_input_tokens_seen": 5032374880, "step": 9600 }, { "epoch": 0.09206148562433666, "grad_norm": 0.12261918187141418, "learning_rate": 0.001, "loss": 2.4012, "num_input_tokens_seen": 5058581504, "step": 9650 }, { "epoch": 0.09253848814052494, "grad_norm": 0.13203178346157074, "learning_rate": 0.001, "loss": 2.4058, "num_input_tokens_seen": 5084791232, "step": 9700 }, { "epoch": 0.09301549065671322, "grad_norm": 0.12036694586277008, "learning_rate": 0.001, "loss": 2.4079, "num_input_tokens_seen": 5111005632, "step": 9750 }, { "epoch": 0.09349249317290148, "grad_norm": 0.12211828678846359, "learning_rate": 0.001, "loss": 2.4118, "num_input_tokens_seen": 5137213568, "step": 9800 }, { "epoch": 0.09396949568908976, "grad_norm": 0.1405865103006363, "learning_rate": 0.001, "loss": 2.4128, "num_input_tokens_seen": 5163427424, "step": 9850 }, { "epoch": 0.09444649820527803, "grad_norm": 0.14212754368782043, "learning_rate": 0.001, "loss": 2.4162, "num_input_tokens_seen": 5189637472, "step": 9900 }, { "epoch": 0.09492350072146631, "grad_norm": 0.13048619031906128, "learning_rate": 0.001, "loss": 2.4152, "num_input_tokens_seen": 5215848992, "step": 9950 }, { "epoch": 0.09540050323765457, "grad_norm": 0.13322441279888153, "learning_rate": 0.001, "loss": 2.4056, "num_input_tokens_seen": 5242058496, "step": 10000 }, { "epoch": 0.09540050323765457, "eval_loss": 2.3262879848480225, "eval_runtime": 81.7469, "eval_samples_per_second": 61.164, "eval_steps_per_second": 15.291, "num_input_tokens_seen": 5242058496, "step": 10000 }, { "epoch": 0.09587750575384285, "grad_norm": 0.12825925648212433, "learning_rate": 0.001, "loss": 2.4141, "num_input_tokens_seen": 5268270688, "step": 10050 }, { "epoch": 0.09635450827003113, "grad_norm": 0.12106914073228836, "learning_rate": 0.001, "loss": 2.3985, "num_input_tokens_seen": 5294470400, "step": 10100 }, { "epoch": 0.09683151078621939, "grad_norm": 0.12551487982273102, "learning_rate": 0.001, "loss": 2.4082, "num_input_tokens_seen": 5320668704, "step": 10150 }, { "epoch": 0.09730851330240767, "grad_norm": 0.12404550611972809, "learning_rate": 0.001, "loss": 2.411, "num_input_tokens_seen": 5346877024, "step": 10200 }, { "epoch": 0.09778551581859594, "grad_norm": 0.13011808693408966, "learning_rate": 0.001, "loss": 2.4036, "num_input_tokens_seen": 5373088224, "step": 10250 }, { "epoch": 0.09826251833478422, "grad_norm": 0.14489437639713287, "learning_rate": 0.001, "loss": 2.4013, "num_input_tokens_seen": 5399292544, "step": 10300 }, { "epoch": 0.09873952085097248, "grad_norm": 0.13484328985214233, "learning_rate": 0.001, "loss": 2.4028, "num_input_tokens_seen": 5425504576, "step": 10350 }, { "epoch": 0.09921652336716076, "grad_norm": 0.13810865581035614, "learning_rate": 0.001, "loss": 2.3825, "num_input_tokens_seen": 5451713984, "step": 10400 }, { "epoch": 0.09969352588334904, "grad_norm": 0.12903955578804016, "learning_rate": 0.001, "loss": 2.4072, "num_input_tokens_seen": 5477927488, "step": 10450 }, { "epoch": 0.1001705283995373, "grad_norm": 0.1321643590927124, "learning_rate": 0.001, "loss": 2.3912, "num_input_tokens_seen": 5504131840, "step": 10500 }, { "epoch": 0.1001705283995373, "eval_loss": 2.316988945007324, "eval_runtime": 80.1877, "eval_samples_per_second": 62.354, "eval_steps_per_second": 15.588, "num_input_tokens_seen": 5504131840, "step": 10500 }, { "epoch": 0.10064753091572558, "grad_norm": 0.13744521141052246, "learning_rate": 0.001, "loss": 2.3908, "num_input_tokens_seen": 5530337280, "step": 10550 }, { "epoch": 0.10112453343191385, "grad_norm": 0.14102710783481598, "learning_rate": 0.001, "loss": 2.4009, "num_input_tokens_seen": 5556550688, "step": 10600 }, { "epoch": 0.10160153594810213, "grad_norm": 0.12428227812051773, "learning_rate": 0.001, "loss": 2.4, "num_input_tokens_seen": 5582754848, "step": 10650 }, { "epoch": 0.1020785384642904, "grad_norm": 0.12551705539226532, "learning_rate": 0.001, "loss": 2.3978, "num_input_tokens_seen": 5608963360, "step": 10700 }, { "epoch": 0.10255554098047867, "grad_norm": 0.12045067548751831, "learning_rate": 0.001, "loss": 2.4, "num_input_tokens_seen": 5635168960, "step": 10750 }, { "epoch": 0.10303254349666695, "grad_norm": 0.12914159893989563, "learning_rate": 0.001, "loss": 2.4035, "num_input_tokens_seen": 5661379520, "step": 10800 }, { "epoch": 0.10350954601285522, "grad_norm": 0.1325596123933792, "learning_rate": 0.001, "loss": 2.3917, "num_input_tokens_seen": 5687590496, "step": 10850 }, { "epoch": 0.10398654852904349, "grad_norm": 0.13543546199798584, "learning_rate": 0.001, "loss": 2.3854, "num_input_tokens_seen": 5713802208, "step": 10900 }, { "epoch": 0.10446355104523176, "grad_norm": 0.12515605986118317, "learning_rate": 0.001, "loss": 2.4014, "num_input_tokens_seen": 5740014432, "step": 10950 }, { "epoch": 0.10494055356142004, "grad_norm": 0.12793181836605072, "learning_rate": 0.001, "loss": 2.3781, "num_input_tokens_seen": 5766222432, "step": 11000 }, { "epoch": 0.10494055356142004, "eval_loss": 2.3075389862060547, "eval_runtime": 83.0875, "eval_samples_per_second": 60.178, "eval_steps_per_second": 15.044, "num_input_tokens_seen": 5766222432, "step": 11000 }, { "epoch": 0.1054175560776083, "grad_norm": 0.13516350090503693, "learning_rate": 0.001, "loss": 2.388, "num_input_tokens_seen": 5792429984, "step": 11050 }, { "epoch": 0.10589455859379658, "grad_norm": 0.13579031825065613, "learning_rate": 0.001, "loss": 2.393, "num_input_tokens_seen": 5818639200, "step": 11100 }, { "epoch": 0.10637156110998486, "grad_norm": 0.13308782875537872, "learning_rate": 0.001, "loss": 2.3812, "num_input_tokens_seen": 5844851648, "step": 11150 }, { "epoch": 0.10684856362617313, "grad_norm": 0.12415602058172226, "learning_rate": 0.001, "loss": 2.3932, "num_input_tokens_seen": 5871049088, "step": 11200 }, { "epoch": 0.1073255661423614, "grad_norm": 0.12303244322538376, "learning_rate": 0.001, "loss": 2.3807, "num_input_tokens_seen": 5897261824, "step": 11250 }, { "epoch": 0.10780256865854967, "grad_norm": 0.1346784085035324, "learning_rate": 0.001, "loss": 2.3938, "num_input_tokens_seen": 5923473344, "step": 11300 }, { "epoch": 0.10827957117473795, "grad_norm": 0.133702352643013, "learning_rate": 0.001, "loss": 2.3807, "num_input_tokens_seen": 5949683104, "step": 11350 }, { "epoch": 0.10875657369092621, "grad_norm": 0.14315365254878998, "learning_rate": 0.001, "loss": 2.3837, "num_input_tokens_seen": 5975894688, "step": 11400 }, { "epoch": 0.10923357620711449, "grad_norm": 0.13560393452644348, "learning_rate": 0.001, "loss": 2.3843, "num_input_tokens_seen": 6002107712, "step": 11450 }, { "epoch": 0.10971057872330277, "grad_norm": 0.13891252875328064, "learning_rate": 0.001, "loss": 2.3776, "num_input_tokens_seen": 6028313408, "step": 11500 }, { "epoch": 0.10971057872330277, "eval_loss": 2.297602653503418, "eval_runtime": 82.6077, "eval_samples_per_second": 60.527, "eval_steps_per_second": 15.132, "num_input_tokens_seen": 6028313408, "step": 11500 }, { "epoch": 0.11018758123949104, "grad_norm": 0.13412249088287354, "learning_rate": 0.001, "loss": 2.3752, "num_input_tokens_seen": 6054524992, "step": 11550 }, { "epoch": 0.11066458375567931, "grad_norm": 0.12613041698932648, "learning_rate": 0.001, "loss": 2.3818, "num_input_tokens_seen": 6080738688, "step": 11600 }, { "epoch": 0.11114158627186758, "grad_norm": 0.1549839973449707, "learning_rate": 0.001, "loss": 2.3803, "num_input_tokens_seen": 6106953088, "step": 11650 }, { "epoch": 0.11161858878805586, "grad_norm": 0.12388636916875839, "learning_rate": 0.001, "loss": 2.3816, "num_input_tokens_seen": 6133164992, "step": 11700 }, { "epoch": 0.11209559130424412, "grad_norm": 0.13352590799331665, "learning_rate": 0.001, "loss": 2.3708, "num_input_tokens_seen": 6159376640, "step": 11750 }, { "epoch": 0.1125725938204324, "grad_norm": 0.12554074823856354, "learning_rate": 0.001, "loss": 2.3723, "num_input_tokens_seen": 6185587392, "step": 11800 }, { "epoch": 0.11304959633662068, "grad_norm": 0.12788288295269012, "learning_rate": 0.001, "loss": 2.3847, "num_input_tokens_seen": 6211799456, "step": 11850 }, { "epoch": 0.11352659885280895, "grad_norm": 0.1322234570980072, "learning_rate": 0.001, "loss": 2.3766, "num_input_tokens_seen": 6238009952, "step": 11900 }, { "epoch": 0.11400360136899722, "grad_norm": 0.13440632820129395, "learning_rate": 0.001, "loss": 2.3852, "num_input_tokens_seen": 6264216672, "step": 11950 }, { "epoch": 0.1144806038851855, "grad_norm": 0.12434106320142746, "learning_rate": 0.001, "loss": 2.3759, "num_input_tokens_seen": 6290430912, "step": 12000 }, { "epoch": 0.1144806038851855, "eval_loss": 2.290039539337158, "eval_runtime": 82.0142, "eval_samples_per_second": 60.965, "eval_steps_per_second": 15.241, "num_input_tokens_seen": 6290430912, "step": 12000 }, { "epoch": 0.11495760640137377, "grad_norm": 0.132809117436409, "learning_rate": 0.001, "loss": 2.3768, "num_input_tokens_seen": 6316641216, "step": 12050 }, { "epoch": 0.11543460891756203, "grad_norm": 0.12777090072631836, "learning_rate": 0.001, "loss": 2.3617, "num_input_tokens_seen": 6342855616, "step": 12100 }, { "epoch": 0.11591161143375031, "grad_norm": 0.1328810453414917, "learning_rate": 0.001, "loss": 2.3582, "num_input_tokens_seen": 6369062880, "step": 12150 }, { "epoch": 0.11638861394993859, "grad_norm": 0.13146333396434784, "learning_rate": 0.001, "loss": 2.3629, "num_input_tokens_seen": 6395271424, "step": 12200 }, { "epoch": 0.11686561646612686, "grad_norm": 0.13155700266361237, "learning_rate": 0.001, "loss": 2.3611, "num_input_tokens_seen": 6421478368, "step": 12250 }, { "epoch": 0.11734261898231513, "grad_norm": 0.13666649162769318, "learning_rate": 0.001, "loss": 2.3589, "num_input_tokens_seen": 6447685344, "step": 12300 }, { "epoch": 0.1178196214985034, "grad_norm": 0.12632860243320465, "learning_rate": 0.001, "loss": 2.3583, "num_input_tokens_seen": 6473898912, "step": 12350 }, { "epoch": 0.11829662401469168, "grad_norm": 0.12418720871210098, "learning_rate": 0.001, "loss": 2.362, "num_input_tokens_seen": 6500113312, "step": 12400 }, { "epoch": 0.11877362653087994, "grad_norm": 0.1381850242614746, "learning_rate": 0.001, "loss": 2.3601, "num_input_tokens_seen": 6526318496, "step": 12450 }, { "epoch": 0.11925062904706822, "grad_norm": 0.15137051045894623, "learning_rate": 0.001, "loss": 2.3501, "num_input_tokens_seen": 6552526688, "step": 12500 }, { "epoch": 0.11925062904706822, "eval_loss": 2.2824325561523438, "eval_runtime": 82.5334, "eval_samples_per_second": 60.582, "eval_steps_per_second": 15.145, "num_input_tokens_seen": 6552526688, "step": 12500 }, { "epoch": 0.1197276315632565, "grad_norm": 0.11741863191127777, "learning_rate": 0.001, "loss": 2.3813, "num_input_tokens_seen": 6578735776, "step": 12550 }, { "epoch": 0.12020463407944477, "grad_norm": 0.11584734171628952, "learning_rate": 0.001, "loss": 2.3718, "num_input_tokens_seen": 6604948416, "step": 12600 }, { "epoch": 0.12068163659563304, "grad_norm": 0.13832303881645203, "learning_rate": 0.001, "loss": 2.3622, "num_input_tokens_seen": 6631155680, "step": 12650 }, { "epoch": 0.12115863911182131, "grad_norm": 0.13220873475074768, "learning_rate": 0.001, "loss": 2.3601, "num_input_tokens_seen": 6657368192, "step": 12700 }, { "epoch": 0.12163564162800959, "grad_norm": 0.13639794290065765, "learning_rate": 0.001, "loss": 2.3625, "num_input_tokens_seen": 6683582592, "step": 12750 }, { "epoch": 0.12211264414419785, "grad_norm": 0.12675660848617554, "learning_rate": 0.001, "loss": 2.361, "num_input_tokens_seen": 6709791808, "step": 12800 }, { "epoch": 0.12258964666038613, "grad_norm": 0.12696968019008636, "learning_rate": 0.001, "loss": 2.3654, "num_input_tokens_seen": 6735995008, "step": 12850 }, { "epoch": 0.12306664917657441, "grad_norm": 0.13134369254112244, "learning_rate": 0.001, "loss": 2.358, "num_input_tokens_seen": 6762206080, "step": 12900 }, { "epoch": 0.12354365169276268, "grad_norm": 0.1370420753955841, "learning_rate": 0.001, "loss": 2.3435, "num_input_tokens_seen": 6788420480, "step": 12950 }, { "epoch": 0.12402065420895095, "grad_norm": 0.13414695858955383, "learning_rate": 0.001, "loss": 2.3572, "num_input_tokens_seen": 6814626336, "step": 13000 }, { "epoch": 0.12402065420895095, "eval_loss": 2.275796413421631, "eval_runtime": 80.1335, "eval_samples_per_second": 62.396, "eval_steps_per_second": 15.599, "num_input_tokens_seen": 6814626336, "step": 13000 }, { "epoch": 0.12449765672513922, "grad_norm": 0.1583530604839325, "learning_rate": 0.001, "loss": 2.3532, "num_input_tokens_seen": 6840830240, "step": 13050 }, { "epoch": 0.1249746592413275, "grad_norm": 0.13726601004600525, "learning_rate": 0.001, "loss": 2.3478, "num_input_tokens_seen": 6867035264, "step": 13100 }, { "epoch": 0.12545166175751576, "grad_norm": 0.13253213465213776, "learning_rate": 0.001, "loss": 2.3525, "num_input_tokens_seen": 6893243904, "step": 13150 }, { "epoch": 0.12592866427370406, "grad_norm": 0.14362353086471558, "learning_rate": 0.001, "loss": 2.3557, "num_input_tokens_seen": 6919452384, "step": 13200 }, { "epoch": 0.12640566678989232, "grad_norm": 0.13510292768478394, "learning_rate": 0.001, "loss": 2.3496, "num_input_tokens_seen": 6945653600, "step": 13250 }, { "epoch": 0.12688266930608058, "grad_norm": 0.14929993450641632, "learning_rate": 0.001, "loss": 2.359, "num_input_tokens_seen": 6971868000, "step": 13300 }, { "epoch": 0.12735967182226887, "grad_norm": 0.14635959267616272, "learning_rate": 0.001, "loss": 2.3487, "num_input_tokens_seen": 6998077856, "step": 13350 }, { "epoch": 0.12783667433845713, "grad_norm": 0.129233717918396, "learning_rate": 0.001, "loss": 2.3566, "num_input_tokens_seen": 7024292256, "step": 13400 }, { "epoch": 0.1283136768546454, "grad_norm": 0.13718649744987488, "learning_rate": 0.001, "loss": 2.3528, "num_input_tokens_seen": 7050505088, "step": 13450 }, { "epoch": 0.1287906793708337, "grad_norm": 0.13179470598697662, "learning_rate": 0.001, "loss": 2.3451, "num_input_tokens_seen": 7076718080, "step": 13500 }, { "epoch": 0.1287906793708337, "eval_loss": 2.267688274383545, "eval_runtime": 80.4123, "eval_samples_per_second": 62.18, "eval_steps_per_second": 15.545, "num_input_tokens_seen": 7076718080, "step": 13500 }, { "epoch": 0.12926768188702195, "grad_norm": 0.129612535238266, "learning_rate": 0.001, "loss": 2.3466, "num_input_tokens_seen": 7102928416, "step": 13550 }, { "epoch": 0.12974468440321021, "grad_norm": 0.14502273499965668, "learning_rate": 0.001, "loss": 2.3514, "num_input_tokens_seen": 7129138080, "step": 13600 }, { "epoch": 0.1302216869193985, "grad_norm": 0.12477376312017441, "learning_rate": 0.001, "loss": 2.3498, "num_input_tokens_seen": 7155346432, "step": 13650 }, { "epoch": 0.13069868943558677, "grad_norm": 0.12704899907112122, "learning_rate": 0.001, "loss": 2.3568, "num_input_tokens_seen": 7181560832, "step": 13700 }, { "epoch": 0.13117569195177506, "grad_norm": 0.127015620470047, "learning_rate": 0.001, "loss": 2.344, "num_input_tokens_seen": 7207773952, "step": 13750 }, { "epoch": 0.13165269446796332, "grad_norm": 0.1374967098236084, "learning_rate": 0.001, "loss": 2.3446, "num_input_tokens_seen": 7233985504, "step": 13800 }, { "epoch": 0.13212969698415158, "grad_norm": 0.1342546045780182, "learning_rate": 0.001, "loss": 2.3429, "num_input_tokens_seen": 7260196224, "step": 13850 }, { "epoch": 0.13260669950033988, "grad_norm": 0.13680048286914825, "learning_rate": 0.001, "loss": 2.3499, "num_input_tokens_seen": 7286404832, "step": 13900 }, { "epoch": 0.13308370201652814, "grad_norm": 0.12522684037685394, "learning_rate": 0.001, "loss": 2.3507, "num_input_tokens_seen": 7312617024, "step": 13950 }, { "epoch": 0.1335607045327164, "grad_norm": 0.12328428030014038, "learning_rate": 0.001, "loss": 2.3437, "num_input_tokens_seen": 7338830528, "step": 14000 }, { "epoch": 0.1335607045327164, "eval_loss": 2.26138973236084, "eval_runtime": 82.7425, "eval_samples_per_second": 60.428, "eval_steps_per_second": 15.107, "num_input_tokens_seen": 7338830528, "step": 14000 }, { "epoch": 0.1340377070489047, "grad_norm": 0.1246449276804924, "learning_rate": 0.001, "loss": 2.353, "num_input_tokens_seen": 7365043520, "step": 14050 }, { "epoch": 0.13451470956509295, "grad_norm": 0.1269921213388443, "learning_rate": 0.001, "loss": 2.347, "num_input_tokens_seen": 7391257920, "step": 14100 }, { "epoch": 0.13499171208128122, "grad_norm": 0.13668124377727509, "learning_rate": 0.001, "loss": 2.3471, "num_input_tokens_seen": 7417467648, "step": 14150 }, { "epoch": 0.1354687145974695, "grad_norm": 0.15413053333759308, "learning_rate": 0.001, "loss": 2.3423, "num_input_tokens_seen": 7443679424, "step": 14200 }, { "epoch": 0.13594571711365777, "grad_norm": 0.14467491209506989, "learning_rate": 0.001, "loss": 2.3504, "num_input_tokens_seen": 7469890208, "step": 14250 }, { "epoch": 0.13642271962984603, "grad_norm": 0.14191295206546783, "learning_rate": 0.001, "loss": 2.3378, "num_input_tokens_seen": 7496093536, "step": 14300 }, { "epoch": 0.13689972214603432, "grad_norm": 0.14077533781528473, "learning_rate": 0.001, "loss": 2.3351, "num_input_tokens_seen": 7522307936, "step": 14350 }, { "epoch": 0.1373767246622226, "grad_norm": 0.13784116506576538, "learning_rate": 0.001, "loss": 2.3276, "num_input_tokens_seen": 7548522112, "step": 14400 }, { "epoch": 0.13785372717841088, "grad_norm": 0.13621552288532257, "learning_rate": 0.001, "loss": 2.3434, "num_input_tokens_seen": 7574731968, "step": 14450 }, { "epoch": 0.13833072969459914, "grad_norm": 0.1428932249546051, "learning_rate": 0.001, "loss": 2.328, "num_input_tokens_seen": 7600938432, "step": 14500 }, { "epoch": 0.13833072969459914, "eval_loss": 2.256176710128784, "eval_runtime": 82.1088, "eval_samples_per_second": 60.895, "eval_steps_per_second": 15.224, "num_input_tokens_seen": 7600938432, "step": 14500 }, { "epoch": 0.1388077322107874, "grad_norm": 0.12382518500089645, "learning_rate": 0.001, "loss": 2.3401, "num_input_tokens_seen": 7627147296, "step": 14550 }, { "epoch": 0.1392847347269757, "grad_norm": 0.13391022384166718, "learning_rate": 0.001, "loss": 2.3305, "num_input_tokens_seen": 7653361696, "step": 14600 }, { "epoch": 0.13976173724316396, "grad_norm": 0.14608611166477203, "learning_rate": 0.001, "loss": 2.3344, "num_input_tokens_seen": 7679565152, "step": 14650 }, { "epoch": 0.14023873975935222, "grad_norm": 0.1222352534532547, "learning_rate": 0.001, "loss": 2.3235, "num_input_tokens_seen": 7705765120, "step": 14700 }, { "epoch": 0.1407157422755405, "grad_norm": 0.12659655511379242, "learning_rate": 0.001, "loss": 2.335, "num_input_tokens_seen": 7731972128, "step": 14750 }, { "epoch": 0.14119274479172877, "grad_norm": 0.15103894472122192, "learning_rate": 0.001, "loss": 2.3439, "num_input_tokens_seen": 7758179904, "step": 14800 }, { "epoch": 0.14166974730791704, "grad_norm": 0.12366761267185211, "learning_rate": 0.001, "loss": 2.3447, "num_input_tokens_seen": 7784391104, "step": 14850 }, { "epoch": 0.14214674982410533, "grad_norm": 0.12323159724473953, "learning_rate": 0.001, "loss": 2.328, "num_input_tokens_seen": 7810605504, "step": 14900 }, { "epoch": 0.1426237523402936, "grad_norm": 0.13751116394996643, "learning_rate": 0.001, "loss": 2.3318, "num_input_tokens_seen": 7836816160, "step": 14950 }, { "epoch": 0.14310075485648185, "grad_norm": 0.1390126645565033, "learning_rate": 0.001, "loss": 2.3288, "num_input_tokens_seen": 7863022432, "step": 15000 }, { "epoch": 0.14310075485648185, "eval_loss": 2.2487399578094482, "eval_runtime": 81.911, "eval_samples_per_second": 61.042, "eval_steps_per_second": 15.26, "num_input_tokens_seen": 7863022432, "step": 15000 }, { "epoch": 0.14357775737267015, "grad_norm": 0.13024196028709412, "learning_rate": 0.001, "loss": 2.337, "num_input_tokens_seen": 7889232064, "step": 15050 }, { "epoch": 0.1440547598888584, "grad_norm": 0.13981671631336212, "learning_rate": 0.001, "loss": 2.33, "num_input_tokens_seen": 7915444032, "step": 15100 }, { "epoch": 0.1445317624050467, "grad_norm": 0.12976309657096863, "learning_rate": 0.001, "loss": 2.3276, "num_input_tokens_seen": 7941654272, "step": 15150 }, { "epoch": 0.14500876492123496, "grad_norm": 0.14406299591064453, "learning_rate": 0.001, "loss": 2.3245, "num_input_tokens_seen": 7967865216, "step": 15200 }, { "epoch": 0.14548576743742322, "grad_norm": 0.13180013000965118, "learning_rate": 0.001, "loss": 2.3251, "num_input_tokens_seen": 7994074080, "step": 15250 }, { "epoch": 0.14596276995361152, "grad_norm": 0.14100609719753265, "learning_rate": 0.001, "loss": 2.3342, "num_input_tokens_seen": 8020287168, "step": 15300 }, { "epoch": 0.14643977246979978, "grad_norm": 0.14573803544044495, "learning_rate": 0.001, "loss": 2.3251, "num_input_tokens_seen": 8046494176, "step": 15350 }, { "epoch": 0.14691677498598804, "grad_norm": 0.14260108768939972, "learning_rate": 0.001, "loss": 2.3391, "num_input_tokens_seen": 8072706720, "step": 15400 }, { "epoch": 0.14739377750217633, "grad_norm": 0.12735863029956818, "learning_rate": 0.001, "loss": 2.3285, "num_input_tokens_seen": 8098918432, "step": 15450 }, { "epoch": 0.1478707800183646, "grad_norm": 0.13214413821697235, "learning_rate": 0.001, "loss": 2.3259, "num_input_tokens_seen": 8125131456, "step": 15500 }, { "epoch": 0.1478707800183646, "eval_loss": 2.2429773807525635, "eval_runtime": 83.2826, "eval_samples_per_second": 60.037, "eval_steps_per_second": 15.009, "num_input_tokens_seen": 8125131456, "step": 15500 }, { "epoch": 0.14834778253455286, "grad_norm": 0.14493685960769653, "learning_rate": 0.001, "loss": 2.3191, "num_input_tokens_seen": 8151344032, "step": 15550 }, { "epoch": 0.14882478505074115, "grad_norm": 0.12741337716579437, "learning_rate": 0.001, "loss": 2.3341, "num_input_tokens_seen": 8177556096, "step": 15600 }, { "epoch": 0.1493017875669294, "grad_norm": 0.13515712320804596, "learning_rate": 0.001, "loss": 2.317, "num_input_tokens_seen": 8203769152, "step": 15650 }, { "epoch": 0.1497787900831177, "grad_norm": 0.1321142017841339, "learning_rate": 0.001, "loss": 2.318, "num_input_tokens_seen": 8229969312, "step": 15700 }, { "epoch": 0.15025579259930597, "grad_norm": 0.13010093569755554, "learning_rate": 0.001, "loss": 2.3291, "num_input_tokens_seen": 8256183456, "step": 15750 }, { "epoch": 0.15073279511549423, "grad_norm": 0.13135819137096405, "learning_rate": 0.001, "loss": 2.3304, "num_input_tokens_seen": 8282391392, "step": 15800 }, { "epoch": 0.15120979763168252, "grad_norm": 0.13832679390907288, "learning_rate": 0.001, "loss": 2.3206, "num_input_tokens_seen": 8308598656, "step": 15850 }, { "epoch": 0.15168680014787078, "grad_norm": 0.14133113622665405, "learning_rate": 0.001, "loss": 2.3201, "num_input_tokens_seen": 8334808960, "step": 15900 }, { "epoch": 0.15216380266405904, "grad_norm": 0.12465903908014297, "learning_rate": 0.001, "loss": 2.3235, "num_input_tokens_seen": 8361020256, "step": 15950 }, { "epoch": 0.15264080518024734, "grad_norm": 0.1318390965461731, "learning_rate": 0.001, "loss": 2.3103, "num_input_tokens_seen": 8387218560, "step": 16000 }, { "epoch": 0.15264080518024734, "eval_loss": 2.2365996837615967, "eval_runtime": 82.6838, "eval_samples_per_second": 60.471, "eval_steps_per_second": 15.118, "num_input_tokens_seen": 8387218560, "step": 16000 }, { "epoch": 0.1531178076964356, "grad_norm": 0.1479504108428955, "learning_rate": 0.001, "loss": 2.3222, "num_input_tokens_seen": 8413431680, "step": 16050 }, { "epoch": 0.15359481021262386, "grad_norm": 0.12534798681735992, "learning_rate": 0.001, "loss": 2.3064, "num_input_tokens_seen": 8439639584, "step": 16100 }, { "epoch": 0.15407181272881215, "grad_norm": 0.13538773357868195, "learning_rate": 0.001, "loss": 2.3156, "num_input_tokens_seen": 8465838816, "step": 16150 }, { "epoch": 0.15454881524500041, "grad_norm": 0.132590189576149, "learning_rate": 0.001, "loss": 2.3237, "num_input_tokens_seen": 8492046144, "step": 16200 }, { "epoch": 0.15502581776118868, "grad_norm": 0.15315937995910645, "learning_rate": 0.001, "loss": 2.3082, "num_input_tokens_seen": 8518256608, "step": 16250 }, { "epoch": 0.15550282027737697, "grad_norm": 0.14311794936656952, "learning_rate": 0.001, "loss": 2.3135, "num_input_tokens_seen": 8544471008, "step": 16300 }, { "epoch": 0.15597982279356523, "grad_norm": 0.13563624024391174, "learning_rate": 0.001, "loss": 2.319, "num_input_tokens_seen": 8570685408, "step": 16350 }, { "epoch": 0.15645682530975352, "grad_norm": 0.12712624669075012, "learning_rate": 0.001, "loss": 2.3216, "num_input_tokens_seen": 8596898464, "step": 16400 }, { "epoch": 0.15693382782594179, "grad_norm": 0.12751208245754242, "learning_rate": 0.001, "loss": 2.31, "num_input_tokens_seen": 8623111776, "step": 16450 }, { "epoch": 0.15741083034213005, "grad_norm": 0.1371571272611618, "learning_rate": 0.001, "loss": 2.3137, "num_input_tokens_seen": 8649321536, "step": 16500 }, { "epoch": 0.15741083034213005, "eval_loss": 2.232142210006714, "eval_runtime": 82.2631, "eval_samples_per_second": 60.781, "eval_steps_per_second": 15.195, "num_input_tokens_seen": 8649321536, "step": 16500 }, { "epoch": 0.15788783285831834, "grad_norm": 0.1343661993741989, "learning_rate": 0.001, "loss": 2.313, "num_input_tokens_seen": 8675529792, "step": 16550 }, { "epoch": 0.1583648353745066, "grad_norm": 0.14035946130752563, "learning_rate": 0.001, "loss": 2.3097, "num_input_tokens_seen": 8701739712, "step": 16600 }, { "epoch": 0.15884183789069486, "grad_norm": 0.12256618589162827, "learning_rate": 0.001, "loss": 2.3102, "num_input_tokens_seen": 8727951616, "step": 16650 }, { "epoch": 0.15931884040688316, "grad_norm": 0.1355251669883728, "learning_rate": 0.001, "loss": 2.3099, "num_input_tokens_seen": 8754160960, "step": 16700 }, { "epoch": 0.15979584292307142, "grad_norm": 0.13105979561805725, "learning_rate": 0.001, "loss": 2.3075, "num_input_tokens_seen": 8780369344, "step": 16750 }, { "epoch": 0.16027284543925968, "grad_norm": 0.13410349190235138, "learning_rate": 0.001, "loss": 2.3134, "num_input_tokens_seen": 8806583648, "step": 16800 }, { "epoch": 0.16074984795544797, "grad_norm": 0.13738510012626648, "learning_rate": 0.001, "loss": 2.3051, "num_input_tokens_seen": 8832796864, "step": 16850 }, { "epoch": 0.16122685047163623, "grad_norm": 0.13892224431037903, "learning_rate": 0.001, "loss": 2.3243, "num_input_tokens_seen": 8859005632, "step": 16900 }, { "epoch": 0.1617038529878245, "grad_norm": 0.12879416346549988, "learning_rate": 0.001, "loss": 2.3123, "num_input_tokens_seen": 8885216960, "step": 16950 }, { "epoch": 0.1621808555040128, "grad_norm": 0.1300731897354126, "learning_rate": 0.001, "loss": 2.3148, "num_input_tokens_seen": 8911431360, "step": 17000 }, { "epoch": 0.1621808555040128, "eval_loss": 2.2285797595977783, "eval_runtime": 82.7553, "eval_samples_per_second": 60.419, "eval_steps_per_second": 15.105, "num_input_tokens_seen": 8911431360, "step": 17000 }, { "epoch": 0.16265785802020105, "grad_norm": 0.13246452808380127, "learning_rate": 0.001, "loss": 2.3083, "num_input_tokens_seen": 8937641184, "step": 17050 }, { "epoch": 0.16313486053638934, "grad_norm": 0.1408887803554535, "learning_rate": 0.001, "loss": 2.311, "num_input_tokens_seen": 8963855584, "step": 17100 }, { "epoch": 0.1636118630525776, "grad_norm": 0.13497628271579742, "learning_rate": 0.001, "loss": 2.3075, "num_input_tokens_seen": 8990067520, "step": 17150 }, { "epoch": 0.16408886556876587, "grad_norm": 0.13361407816410065, "learning_rate": 0.001, "loss": 2.3048, "num_input_tokens_seen": 9016266240, "step": 17200 }, { "epoch": 0.16456586808495416, "grad_norm": 0.145442932844162, "learning_rate": 0.001, "loss": 2.3086, "num_input_tokens_seen": 9042480000, "step": 17250 }, { "epoch": 0.16504287060114242, "grad_norm": 0.12842726707458496, "learning_rate": 0.001, "loss": 2.3029, "num_input_tokens_seen": 9068694400, "step": 17300 }, { "epoch": 0.16551987311733068, "grad_norm": 0.14847566187381744, "learning_rate": 0.001, "loss": 2.3188, "num_input_tokens_seen": 9094902272, "step": 17350 }, { "epoch": 0.16599687563351898, "grad_norm": 0.13063114881515503, "learning_rate": 0.001, "loss": 2.297, "num_input_tokens_seen": 9121110464, "step": 17400 }, { "epoch": 0.16647387814970724, "grad_norm": 0.16154611110687256, "learning_rate": 0.001, "loss": 2.3122, "num_input_tokens_seen": 9147321632, "step": 17450 }, { "epoch": 0.1669508806658955, "grad_norm": 0.12539538741111755, "learning_rate": 0.001, "loss": 2.3076, "num_input_tokens_seen": 9173533056, "step": 17500 }, { "epoch": 0.1669508806658955, "eval_loss": 2.2225582599639893, "eval_runtime": 82.4465, "eval_samples_per_second": 60.645, "eval_steps_per_second": 15.161, "num_input_tokens_seen": 9173533056, "step": 17500 }, { "epoch": 0.1674278831820838, "grad_norm": 0.1455305516719818, "learning_rate": 0.001, "loss": 2.2964, "num_input_tokens_seen": 9199741376, "step": 17550 }, { "epoch": 0.16790488569827206, "grad_norm": 0.1348162442445755, "learning_rate": 0.001, "loss": 2.3055, "num_input_tokens_seen": 9225953984, "step": 17600 }, { "epoch": 0.16838188821446032, "grad_norm": 0.1430789977312088, "learning_rate": 0.001, "loss": 2.309, "num_input_tokens_seen": 9252159616, "step": 17650 }, { "epoch": 0.1688588907306486, "grad_norm": 0.14652392268180847, "learning_rate": 0.001, "loss": 2.3052, "num_input_tokens_seen": 9278371392, "step": 17700 }, { "epoch": 0.16933589324683687, "grad_norm": 0.13538667559623718, "learning_rate": 0.001, "loss": 2.3147, "num_input_tokens_seen": 9304572736, "step": 17750 }, { "epoch": 0.16981289576302516, "grad_norm": 0.13386596739292145, "learning_rate": 0.001, "loss": 2.3044, "num_input_tokens_seen": 9330787136, "step": 17800 }, { "epoch": 0.17028989827921343, "grad_norm": 0.1391988843679428, "learning_rate": 0.001, "loss": 2.2956, "num_input_tokens_seen": 9357001536, "step": 17850 }, { "epoch": 0.1707669007954017, "grad_norm": 0.13184039294719696, "learning_rate": 0.001, "loss": 2.2965, "num_input_tokens_seen": 9383215936, "step": 17900 }, { "epoch": 0.17124390331158998, "grad_norm": 0.14412756264209747, "learning_rate": 0.001, "loss": 2.292, "num_input_tokens_seen": 9409427392, "step": 17950 }, { "epoch": 0.17172090582777824, "grad_norm": 0.12889249622821808, "learning_rate": 0.001, "loss": 2.2963, "num_input_tokens_seen": 9435637536, "step": 18000 }, { "epoch": 0.17172090582777824, "eval_loss": 2.216590166091919, "eval_runtime": 82.2621, "eval_samples_per_second": 60.781, "eval_steps_per_second": 15.195, "num_input_tokens_seen": 9435637536, "step": 18000 }, { "epoch": 0.1721979083439665, "grad_norm": 0.1292746514081955, "learning_rate": 0.001, "loss": 2.3103, "num_input_tokens_seen": 9461834176, "step": 18050 }, { "epoch": 0.1726749108601548, "grad_norm": 0.13079434633255005, "learning_rate": 0.001, "loss": 2.3045, "num_input_tokens_seen": 9488041024, "step": 18100 }, { "epoch": 0.17315191337634306, "grad_norm": 0.1451425701379776, "learning_rate": 0.001, "loss": 2.3127, "num_input_tokens_seen": 9514248512, "step": 18150 }, { "epoch": 0.17362891589253132, "grad_norm": 0.14286376535892487, "learning_rate": 0.001, "loss": 2.296, "num_input_tokens_seen": 9540460992, "step": 18200 }, { "epoch": 0.1741059184087196, "grad_norm": 0.14793863892555237, "learning_rate": 0.001, "loss": 2.295, "num_input_tokens_seen": 9566675392, "step": 18250 }, { "epoch": 0.17458292092490788, "grad_norm": 0.13479390740394592, "learning_rate": 0.001, "loss": 2.2925, "num_input_tokens_seen": 9592885152, "step": 18300 }, { "epoch": 0.17505992344109614, "grad_norm": 0.14160257577896118, "learning_rate": 0.001, "loss": 2.2984, "num_input_tokens_seen": 9619098336, "step": 18350 }, { "epoch": 0.17553692595728443, "grad_norm": 0.1370360404253006, "learning_rate": 0.001, "loss": 2.283, "num_input_tokens_seen": 9645312736, "step": 18400 }, { "epoch": 0.1760139284734727, "grad_norm": 0.13573038578033447, "learning_rate": 0.001, "loss": 2.2902, "num_input_tokens_seen": 9671524352, "step": 18450 }, { "epoch": 0.17649093098966098, "grad_norm": 0.14134661853313446, "learning_rate": 0.001, "loss": 2.3052, "num_input_tokens_seen": 9697738752, "step": 18500 }, { "epoch": 0.17649093098966098, "eval_loss": 2.2122554779052734, "eval_runtime": 83.2223, "eval_samples_per_second": 60.08, "eval_steps_per_second": 15.02, "num_input_tokens_seen": 9697738752, "step": 18500 }, { "epoch": 0.17696793350584925, "grad_norm": 0.13307662308216095, "learning_rate": 0.001, "loss": 2.2964, "num_input_tokens_seen": 9723948800, "step": 18550 }, { "epoch": 0.1774449360220375, "grad_norm": 0.14741794764995575, "learning_rate": 0.001, "loss": 2.2947, "num_input_tokens_seen": 9750163200, "step": 18600 }, { "epoch": 0.1779219385382258, "grad_norm": 0.1431114673614502, "learning_rate": 0.001, "loss": 2.299, "num_input_tokens_seen": 9776369696, "step": 18650 }, { "epoch": 0.17839894105441406, "grad_norm": 0.1539929211139679, "learning_rate": 0.001, "loss": 2.2949, "num_input_tokens_seen": 9802580192, "step": 18700 }, { "epoch": 0.17887594357060232, "grad_norm": 0.13433188199996948, "learning_rate": 0.001, "loss": 2.2964, "num_input_tokens_seen": 9828792608, "step": 18750 }, { "epoch": 0.17935294608679062, "grad_norm": 0.12964121997356415, "learning_rate": 0.001, "loss": 2.2981, "num_input_tokens_seen": 9855007008, "step": 18800 }, { "epoch": 0.17982994860297888, "grad_norm": 0.1349261850118637, "learning_rate": 0.001, "loss": 2.2875, "num_input_tokens_seen": 9881218528, "step": 18850 }, { "epoch": 0.18030695111916714, "grad_norm": 0.12905199825763702, "learning_rate": 0.001, "loss": 2.2973, "num_input_tokens_seen": 9907428192, "step": 18900 }, { "epoch": 0.18078395363535543, "grad_norm": 0.13705725967884064, "learning_rate": 0.001, "loss": 2.2936, "num_input_tokens_seen": 9933638112, "step": 18950 }, { "epoch": 0.1812609561515437, "grad_norm": 0.13736732304096222, "learning_rate": 0.001, "loss": 2.2941, "num_input_tokens_seen": 9959851776, "step": 19000 }, { "epoch": 0.1812609561515437, "eval_loss": 2.2090442180633545, "eval_runtime": 82.3122, "eval_samples_per_second": 60.744, "eval_steps_per_second": 15.186, "num_input_tokens_seen": 9959851776, "step": 19000 }, { "epoch": 0.18173795866773199, "grad_norm": 0.14507094025611877, "learning_rate": 0.001, "loss": 2.2873, "num_input_tokens_seen": 9986063136, "step": 19050 }, { "epoch": 0.18221496118392025, "grad_norm": 0.14904463291168213, "learning_rate": 0.001, "loss": 2.2857, "num_input_tokens_seen": 10012277344, "step": 19100 }, { "epoch": 0.1826919637001085, "grad_norm": 0.1437740921974182, "learning_rate": 0.001, "loss": 2.293, "num_input_tokens_seen": 10038487648, "step": 19150 }, { "epoch": 0.1831689662162968, "grad_norm": 0.13508464395999908, "learning_rate": 0.001, "loss": 2.2888, "num_input_tokens_seen": 10064701312, "step": 19200 }, { "epoch": 0.18364596873248507, "grad_norm": 0.1317240297794342, "learning_rate": 0.001, "loss": 2.2916, "num_input_tokens_seen": 10090910496, "step": 19250 }, { "epoch": 0.18412297124867333, "grad_norm": 0.13427771627902985, "learning_rate": 0.001, "loss": 2.2861, "num_input_tokens_seen": 10117124896, "step": 19300 }, { "epoch": 0.18459997376486162, "grad_norm": 0.17408016324043274, "learning_rate": 0.001, "loss": 2.2826, "num_input_tokens_seen": 10143339296, "step": 19350 }, { "epoch": 0.18507697628104988, "grad_norm": 0.15968067944049835, "learning_rate": 0.001, "loss": 2.3486, "num_input_tokens_seen": 10169540896, "step": 19400 }, { "epoch": 0.18555397879723814, "grad_norm": 0.12174613028764725, "learning_rate": 0.001, "loss": 2.2963, "num_input_tokens_seen": 10195751040, "step": 19450 }, { "epoch": 0.18603098131342644, "grad_norm": 0.1349005550146103, "learning_rate": 0.001, "loss": 2.2888, "num_input_tokens_seen": 10221963136, "step": 19500 }, { "epoch": 0.18603098131342644, "eval_loss": 2.2117698192596436, "eval_runtime": 81.7726, "eval_samples_per_second": 61.145, "eval_steps_per_second": 15.286, "num_input_tokens_seen": 10221963136, "step": 19500 }, { "epoch": 0.1865079838296147, "grad_norm": 0.12541209161281586, "learning_rate": 0.001, "loss": 2.2966, "num_input_tokens_seen": 10248173024, "step": 19550 }, { "epoch": 0.18698498634580296, "grad_norm": 0.9949402213096619, "learning_rate": 0.001, "loss": 2.2825, "num_input_tokens_seen": 10274379520, "step": 19600 }, { "epoch": 0.18746198886199125, "grad_norm": 0.13587036728858948, "learning_rate": 0.001, "loss": 2.2963, "num_input_tokens_seen": 10300591040, "step": 19650 }, { "epoch": 0.18793899137817952, "grad_norm": 0.14047515392303467, "learning_rate": 0.001, "loss": 2.2871, "num_input_tokens_seen": 10326800928, "step": 19700 }, { "epoch": 0.1884159938943678, "grad_norm": 0.13005691766738892, "learning_rate": 0.001, "loss": 2.3058, "num_input_tokens_seen": 10353009568, "step": 19750 }, { "epoch": 0.18889299641055607, "grad_norm": 0.13120286166667938, "learning_rate": 0.001, "loss": 2.2927, "num_input_tokens_seen": 10379210048, "step": 19800 }, { "epoch": 0.18936999892674433, "grad_norm": 0.14059720933437347, "learning_rate": 0.001, "loss": 2.2887, "num_input_tokens_seen": 10405422080, "step": 19850 }, { "epoch": 0.18984700144293262, "grad_norm": 0.13072331249713898, "learning_rate": 0.001, "loss": 2.2928, "num_input_tokens_seen": 10431635744, "step": 19900 }, { "epoch": 0.19032400395912089, "grad_norm": 0.14114826917648315, "learning_rate": 0.001, "loss": 2.284, "num_input_tokens_seen": 10457844768, "step": 19950 }, { "epoch": 0.19080100647530915, "grad_norm": 0.13289280235767365, "learning_rate": 0.001, "loss": 2.2894, "num_input_tokens_seen": 10484059168, "step": 20000 }, { "epoch": 0.19080100647530915, "eval_loss": 2.20172381401062, "eval_runtime": 82.0311, "eval_samples_per_second": 60.952, "eval_steps_per_second": 15.238, "num_input_tokens_seen": 10484059168, "step": 20000 }, { "epoch": 0.19127800899149744, "grad_norm": 0.14763779938220978, "learning_rate": 0.001, "loss": 2.2901, "num_input_tokens_seen": 10510273568, "step": 20050 }, { "epoch": 0.1917550115076857, "grad_norm": 0.13675181567668915, "learning_rate": 0.001, "loss": 2.2809, "num_input_tokens_seen": 10536486432, "step": 20100 }, { "epoch": 0.19223201402387396, "grad_norm": 0.13765814900398254, "learning_rate": 0.001, "loss": 2.289, "num_input_tokens_seen": 10562700832, "step": 20150 }, { "epoch": 0.19270901654006226, "grad_norm": 0.1395033895969391, "learning_rate": 0.001, "loss": 2.286, "num_input_tokens_seen": 10588891776, "step": 20200 }, { "epoch": 0.19318601905625052, "grad_norm": 0.14209134876728058, "learning_rate": 0.001, "loss": 2.2805, "num_input_tokens_seen": 10615106176, "step": 20250 }, { "epoch": 0.19366302157243878, "grad_norm": 0.1354246586561203, "learning_rate": 0.001, "loss": 2.2817, "num_input_tokens_seen": 10641312192, "step": 20300 }, { "epoch": 0.19414002408862707, "grad_norm": 0.1305360049009323, "learning_rate": 0.001, "loss": 2.2871, "num_input_tokens_seen": 10667526080, "step": 20350 }, { "epoch": 0.19461702660481534, "grad_norm": 0.13948604464530945, "learning_rate": 0.001, "loss": 2.2841, "num_input_tokens_seen": 10693737664, "step": 20400 }, { "epoch": 0.19509402912100363, "grad_norm": 0.12424025684595108, "learning_rate": 0.001, "loss": 2.2838, "num_input_tokens_seen": 10719951328, "step": 20450 }, { "epoch": 0.1955710316371919, "grad_norm": 0.14923156797885895, "learning_rate": 0.001, "loss": 2.2882, "num_input_tokens_seen": 10746164768, "step": 20500 }, { "epoch": 0.1955710316371919, "eval_loss": 2.1973979473114014, "eval_runtime": 82.5785, "eval_samples_per_second": 60.548, "eval_steps_per_second": 15.137, "num_input_tokens_seen": 10746164768, "step": 20500 }, { "epoch": 0.19604803415338015, "grad_norm": 0.14104098081588745, "learning_rate": 0.001, "loss": 2.2842, "num_input_tokens_seen": 10772366272, "step": 20550 }, { "epoch": 0.19652503666956844, "grad_norm": 0.1297464370727539, "learning_rate": 0.001, "loss": 2.2835, "num_input_tokens_seen": 10798576992, "step": 20600 }, { "epoch": 0.1970020391857567, "grad_norm": 0.1436595320701599, "learning_rate": 0.001, "loss": 2.2744, "num_input_tokens_seen": 10824786016, "step": 20650 }, { "epoch": 0.19747904170194497, "grad_norm": 0.14249320328235626, "learning_rate": 0.001, "loss": 2.2823, "num_input_tokens_seen": 10850990816, "step": 20700 }, { "epoch": 0.19795604421813326, "grad_norm": 0.14356642961502075, "learning_rate": 0.001, "loss": 2.2891, "num_input_tokens_seen": 10877198080, "step": 20750 }, { "epoch": 0.19843304673432152, "grad_norm": 0.13429990410804749, "learning_rate": 0.001, "loss": 2.2786, "num_input_tokens_seen": 10903412480, "step": 20800 }, { "epoch": 0.19891004925050979, "grad_norm": 0.1445857435464859, "learning_rate": 0.001, "loss": 2.2761, "num_input_tokens_seen": 10929623488, "step": 20850 }, { "epoch": 0.19938705176669808, "grad_norm": 0.13351799547672272, "learning_rate": 0.001, "loss": 2.2801, "num_input_tokens_seen": 10955835264, "step": 20900 }, { "epoch": 0.19986405428288634, "grad_norm": 0.13249842822551727, "learning_rate": 0.001, "loss": 2.2807, "num_input_tokens_seen": 10982046176, "step": 20950 }, { "epoch": 0.2003410567990746, "grad_norm": 0.12836948037147522, "learning_rate": 0.001, "loss": 2.2677, "num_input_tokens_seen": 11008255872, "step": 21000 }, { "epoch": 0.2003410567990746, "eval_loss": 2.1926751136779785, "eval_runtime": 82.0402, "eval_samples_per_second": 60.946, "eval_steps_per_second": 15.236, "num_input_tokens_seen": 11008255872, "step": 21000 }, { "epoch": 0.2008180593152629, "grad_norm": 0.1373811513185501, "learning_rate": 0.001, "loss": 2.2797, "num_input_tokens_seen": 11034461376, "step": 21050 }, { "epoch": 0.20129506183145116, "grad_norm": 0.130074143409729, "learning_rate": 0.001, "loss": 2.2607, "num_input_tokens_seen": 11060670400, "step": 21100 }, { "epoch": 0.20177206434763945, "grad_norm": 0.13792483508586884, "learning_rate": 0.001, "loss": 2.2676, "num_input_tokens_seen": 11086880960, "step": 21150 }, { "epoch": 0.2022490668638277, "grad_norm": 0.1272813379764557, "learning_rate": 0.001, "loss": 2.2728, "num_input_tokens_seen": 11113093024, "step": 21200 }, { "epoch": 0.20272606938001597, "grad_norm": 0.1411881297826767, "learning_rate": 0.001, "loss": 2.2725, "num_input_tokens_seen": 11139305248, "step": 21250 }, { "epoch": 0.20320307189620426, "grad_norm": 0.15611988306045532, "learning_rate": 0.001, "loss": 2.2689, "num_input_tokens_seen": 11165511296, "step": 21300 }, { "epoch": 0.20368007441239253, "grad_norm": 0.13627928495407104, "learning_rate": 0.001, "loss": 2.2721, "num_input_tokens_seen": 11191723008, "step": 21350 }, { "epoch": 0.2041570769285808, "grad_norm": 0.14451804757118225, "learning_rate": 0.001, "loss": 2.2693, "num_input_tokens_seen": 11217937408, "step": 21400 }, { "epoch": 0.20463407944476908, "grad_norm": 0.1419762820005417, "learning_rate": 0.001, "loss": 2.2654, "num_input_tokens_seen": 11244151808, "step": 21450 }, { "epoch": 0.20511108196095734, "grad_norm": 0.139862060546875, "learning_rate": 0.001, "loss": 2.2577, "num_input_tokens_seen": 11270362240, "step": 21500 }, { "epoch": 0.20511108196095734, "eval_loss": 2.190119981765747, "eval_runtime": 82.6744, "eval_samples_per_second": 60.478, "eval_steps_per_second": 15.12, "num_input_tokens_seen": 11270362240, "step": 21500 }, { "epoch": 0.2055880844771456, "grad_norm": 0.13659726083278656, "learning_rate": 0.001, "loss": 2.273, "num_input_tokens_seen": 11296576640, "step": 21550 }, { "epoch": 0.2060650869933339, "grad_norm": 0.12730096280574799, "learning_rate": 0.001, "loss": 2.2651, "num_input_tokens_seen": 11322782720, "step": 21600 }, { "epoch": 0.20654208950952216, "grad_norm": 0.1489386260509491, "learning_rate": 0.001, "loss": 2.279, "num_input_tokens_seen": 11348986624, "step": 21650 }, { "epoch": 0.20701909202571045, "grad_norm": 0.13576173782348633, "learning_rate": 0.001, "loss": 2.2604, "num_input_tokens_seen": 11375197504, "step": 21700 }, { "epoch": 0.2074960945418987, "grad_norm": 0.15627992153167725, "learning_rate": 0.001, "loss": 2.2675, "num_input_tokens_seen": 11401405824, "step": 21750 }, { "epoch": 0.20797309705808698, "grad_norm": 0.14521074295043945, "learning_rate": 0.001, "loss": 2.2696, "num_input_tokens_seen": 11427616352, "step": 21800 }, { "epoch": 0.20845009957427527, "grad_norm": 0.15713635087013245, "learning_rate": 0.001, "loss": 2.2763, "num_input_tokens_seen": 11453820544, "step": 21850 }, { "epoch": 0.20892710209046353, "grad_norm": 0.15573829412460327, "learning_rate": 0.001, "loss": 2.2675, "num_input_tokens_seen": 11480031968, "step": 21900 }, { "epoch": 0.2094041046066518, "grad_norm": 0.1381770819425583, "learning_rate": 0.001, "loss": 2.2698, "num_input_tokens_seen": 11506246368, "step": 21950 }, { "epoch": 0.20988110712284008, "grad_norm": 0.17163416743278503, "learning_rate": 0.001, "loss": 2.2725, "num_input_tokens_seen": 11532457408, "step": 22000 }, { "epoch": 0.20988110712284008, "eval_loss": 2.1856114864349365, "eval_runtime": 82.4539, "eval_samples_per_second": 60.64, "eval_steps_per_second": 15.16, "num_input_tokens_seen": 11532457408, "step": 22000 }, { "epoch": 0.21035810963902835, "grad_norm": 0.13742762804031372, "learning_rate": 0.001, "loss": 2.2819, "num_input_tokens_seen": 11558665024, "step": 22050 }, { "epoch": 0.2108351121552166, "grad_norm": 0.1606198400259018, "learning_rate": 0.001, "loss": 2.2809, "num_input_tokens_seen": 11584879424, "step": 22100 }, { "epoch": 0.2113121146714049, "grad_norm": 0.1447242647409439, "learning_rate": 0.001, "loss": 2.272, "num_input_tokens_seen": 11611093824, "step": 22150 }, { "epoch": 0.21178911718759316, "grad_norm": 0.14127366244792938, "learning_rate": 0.001, "loss": 2.2605, "num_input_tokens_seen": 11637306304, "step": 22200 }, { "epoch": 0.21226611970378143, "grad_norm": 0.13236087560653687, "learning_rate": 0.001, "loss": 2.2676, "num_input_tokens_seen": 11663514912, "step": 22250 }, { "epoch": 0.21274312221996972, "grad_norm": 0.131170392036438, "learning_rate": 0.001, "loss": 2.2573, "num_input_tokens_seen": 11689716960, "step": 22300 }, { "epoch": 0.21322012473615798, "grad_norm": 0.16254200041294098, "learning_rate": 0.001, "loss": 2.252, "num_input_tokens_seen": 11715924704, "step": 22350 }, { "epoch": 0.21369712725234627, "grad_norm": 0.14250585436820984, "learning_rate": 0.001, "loss": 2.2739, "num_input_tokens_seen": 11742135840, "step": 22400 }, { "epoch": 0.21417412976853453, "grad_norm": 0.131143257021904, "learning_rate": 0.001, "loss": 2.2661, "num_input_tokens_seen": 11768347232, "step": 22450 }, { "epoch": 0.2146511322847228, "grad_norm": 0.13916635513305664, "learning_rate": 0.001, "loss": 2.2519, "num_input_tokens_seen": 11794558656, "step": 22500 }, { "epoch": 0.2146511322847228, "eval_loss": 2.183870315551758, "eval_runtime": 82.7383, "eval_samples_per_second": 60.431, "eval_steps_per_second": 15.108, "num_input_tokens_seen": 11794558656, "step": 22500 }, { "epoch": 0.21512813480091109, "grad_norm": 0.14609429240226746, "learning_rate": 0.001, "loss": 2.2588, "num_input_tokens_seen": 11820771552, "step": 22550 }, { "epoch": 0.21560513731709935, "grad_norm": 0.140402689576149, "learning_rate": 0.001, "loss": 2.2826, "num_input_tokens_seen": 11846982720, "step": 22600 }, { "epoch": 0.2160821398332876, "grad_norm": 0.14499905705451965, "learning_rate": 0.001, "loss": 2.2704, "num_input_tokens_seen": 11873196512, "step": 22650 }, { "epoch": 0.2165591423494759, "grad_norm": 0.14119970798492432, "learning_rate": 0.001, "loss": 2.2564, "num_input_tokens_seen": 11899404224, "step": 22700 }, { "epoch": 0.21703614486566417, "grad_norm": 0.13618482649326324, "learning_rate": 0.001, "loss": 2.2616, "num_input_tokens_seen": 11925615904, "step": 22750 }, { "epoch": 0.21751314738185243, "grad_norm": 0.15894031524658203, "learning_rate": 0.001, "loss": 2.2826, "num_input_tokens_seen": 11951821216, "step": 22800 }, { "epoch": 0.21799014989804072, "grad_norm": 0.13335183262825012, "learning_rate": 0.001, "loss": 2.2615, "num_input_tokens_seen": 11978025056, "step": 22850 }, { "epoch": 0.21846715241422898, "grad_norm": 0.1391170769929886, "learning_rate": 0.001, "loss": 2.2677, "num_input_tokens_seen": 12004238368, "step": 22900 }, { "epoch": 0.21894415493041725, "grad_norm": 0.14966392517089844, "learning_rate": 0.001, "loss": 2.2742, "num_input_tokens_seen": 12030450848, "step": 22950 }, { "epoch": 0.21942115744660554, "grad_norm": 0.15453237295150757, "learning_rate": 0.001, "loss": 2.266, "num_input_tokens_seen": 12056655104, "step": 23000 }, { "epoch": 0.21942115744660554, "eval_loss": 2.17928409576416, "eval_runtime": 82.7688, "eval_samples_per_second": 60.409, "eval_steps_per_second": 15.102, "num_input_tokens_seen": 12056655104, "step": 23000 }, { "epoch": 0.2198981599627938, "grad_norm": 0.1397433876991272, "learning_rate": 0.001, "loss": 2.2649, "num_input_tokens_seen": 12082858944, "step": 23050 }, { "epoch": 0.2203751624789821, "grad_norm": 0.13647589087486267, "learning_rate": 0.001, "loss": 2.2639, "num_input_tokens_seen": 12109067872, "step": 23100 }, { "epoch": 0.22085216499517035, "grad_norm": 0.1422584354877472, "learning_rate": 0.001, "loss": 2.2641, "num_input_tokens_seen": 12135282272, "step": 23150 }, { "epoch": 0.22132916751135862, "grad_norm": 0.14315859973430634, "learning_rate": 0.001, "loss": 2.2587, "num_input_tokens_seen": 12161491840, "step": 23200 }, { "epoch": 0.2218061700275469, "grad_norm": 0.14624252915382385, "learning_rate": 0.001, "loss": 2.2658, "num_input_tokens_seen": 12187700480, "step": 23250 }, { "epoch": 0.22228317254373517, "grad_norm": 0.14765731990337372, "learning_rate": 0.001, "loss": 2.263, "num_input_tokens_seen": 12213907680, "step": 23300 }, { "epoch": 0.22276017505992343, "grad_norm": 0.15279778838157654, "learning_rate": 0.001, "loss": 2.2529, "num_input_tokens_seen": 12240118848, "step": 23350 }, { "epoch": 0.22323717757611172, "grad_norm": 0.1480414867401123, "learning_rate": 0.001, "loss": 2.2545, "num_input_tokens_seen": 12266329376, "step": 23400 }, { "epoch": 0.22371418009229999, "grad_norm": 0.1284361481666565, "learning_rate": 0.001, "loss": 2.2659, "num_input_tokens_seen": 12292540960, "step": 23450 }, { "epoch": 0.22419118260848825, "grad_norm": 0.138748899102211, "learning_rate": 0.001, "loss": 2.2531, "num_input_tokens_seen": 12318747360, "step": 23500 }, { "epoch": 0.22419118260848825, "eval_loss": 2.17672061920166, "eval_runtime": 82.9579, "eval_samples_per_second": 60.272, "eval_steps_per_second": 15.068, "num_input_tokens_seen": 12318747360, "step": 23500 }, { "epoch": 0.22466818512467654, "grad_norm": 0.13704177737236023, "learning_rate": 0.001, "loss": 2.2553, "num_input_tokens_seen": 12344950880, "step": 23550 }, { "epoch": 0.2251451876408648, "grad_norm": 0.1447945088148117, "learning_rate": 0.001, "loss": 2.2516, "num_input_tokens_seen": 12371157184, "step": 23600 }, { "epoch": 0.22562219015705307, "grad_norm": 0.13667277991771698, "learning_rate": 0.001, "loss": 2.2556, "num_input_tokens_seen": 12397370368, "step": 23650 }, { "epoch": 0.22609919267324136, "grad_norm": 0.13712671399116516, "learning_rate": 0.001, "loss": 2.2511, "num_input_tokens_seen": 12423583616, "step": 23700 }, { "epoch": 0.22657619518942962, "grad_norm": 0.15262199938297272, "learning_rate": 0.001, "loss": 2.2545, "num_input_tokens_seen": 12449797024, "step": 23750 }, { "epoch": 0.2270531977056179, "grad_norm": 0.1370035856962204, "learning_rate": 0.001, "loss": 2.2558, "num_input_tokens_seen": 12476011424, "step": 23800 }, { "epoch": 0.22753020022180617, "grad_norm": 0.13982941210269928, "learning_rate": 0.001, "loss": 2.2334, "num_input_tokens_seen": 12502223744, "step": 23850 }, { "epoch": 0.22800720273799444, "grad_norm": 0.14523112773895264, "learning_rate": 0.001, "loss": 2.2536, "num_input_tokens_seen": 12528433728, "step": 23900 }, { "epoch": 0.22848420525418273, "grad_norm": 0.1419558823108673, "learning_rate": 0.001, "loss": 2.2568, "num_input_tokens_seen": 12554642496, "step": 23950 }, { "epoch": 0.228961207770371, "grad_norm": 0.1442372351884842, "learning_rate": 0.001, "loss": 2.2522, "num_input_tokens_seen": 12580853504, "step": 24000 }, { "epoch": 0.228961207770371, "eval_loss": 2.1732497215270996, "eval_runtime": 82.253, "eval_samples_per_second": 60.788, "eval_steps_per_second": 15.197, "num_input_tokens_seen": 12580853504, "step": 24000 }, { "epoch": 0.22943821028655925, "grad_norm": 0.13844448328018188, "learning_rate": 0.001, "loss": 2.2602, "num_input_tokens_seen": 12607066272, "step": 24050 }, { "epoch": 0.22991521280274754, "grad_norm": 0.14124740660190582, "learning_rate": 0.001, "loss": 2.2533, "num_input_tokens_seen": 12633279840, "step": 24100 }, { "epoch": 0.2303922153189358, "grad_norm": 0.136307492852211, "learning_rate": 0.001, "loss": 2.2483, "num_input_tokens_seen": 12659487104, "step": 24150 }, { "epoch": 0.23086921783512407, "grad_norm": 0.13790194690227509, "learning_rate": 0.001, "loss": 2.2511, "num_input_tokens_seen": 12685699648, "step": 24200 }, { "epoch": 0.23134622035131236, "grad_norm": 0.13985110819339752, "learning_rate": 0.001, "loss": 2.2637, "num_input_tokens_seen": 12711914048, "step": 24250 }, { "epoch": 0.23182322286750062, "grad_norm": 0.14229442179203033, "learning_rate": 0.001, "loss": 2.2615, "num_input_tokens_seen": 12738126208, "step": 24300 }, { "epoch": 0.23230022538368889, "grad_norm": 0.13444297015666962, "learning_rate": 0.001, "loss": 2.2568, "num_input_tokens_seen": 12764340608, "step": 24350 }, { "epoch": 0.23277722789987718, "grad_norm": 0.14222408831119537, "learning_rate": 0.001, "loss": 2.2579, "num_input_tokens_seen": 12790554080, "step": 24400 }, { "epoch": 0.23325423041606544, "grad_norm": 0.14746561646461487, "learning_rate": 0.001, "loss": 2.2524, "num_input_tokens_seen": 12816760928, "step": 24450 }, { "epoch": 0.23373123293225373, "grad_norm": 0.14593298733234406, "learning_rate": 0.001, "loss": 2.2604, "num_input_tokens_seen": 12842964128, "step": 24500 }, { "epoch": 0.23373123293225373, "eval_loss": 2.171039342880249, "eval_runtime": 82.0422, "eval_samples_per_second": 60.944, "eval_steps_per_second": 15.236, "num_input_tokens_seen": 12842964128, "step": 24500 }, { "epoch": 0.234208235448442, "grad_norm": 0.13651101291179657, "learning_rate": 0.001, "loss": 2.264, "num_input_tokens_seen": 12869178432, "step": 24550 }, { "epoch": 0.23468523796463026, "grad_norm": 0.15846236050128937, "learning_rate": 0.001, "loss": 2.2408, "num_input_tokens_seen": 12895391296, "step": 24600 }, { "epoch": 0.23516224048081855, "grad_norm": 0.16644498705863953, "learning_rate": 0.001, "loss": 2.2455, "num_input_tokens_seen": 12921601472, "step": 24650 }, { "epoch": 0.2356392429970068, "grad_norm": 0.14885085821151733, "learning_rate": 0.001, "loss": 2.261, "num_input_tokens_seen": 12947811040, "step": 24700 }, { "epoch": 0.23611624551319507, "grad_norm": 0.12761050462722778, "learning_rate": 0.001, "loss": 2.2534, "num_input_tokens_seen": 12974014752, "step": 24750 }, { "epoch": 0.23659324802938336, "grad_norm": 0.13764004409313202, "learning_rate": 0.001, "loss": 2.2592, "num_input_tokens_seen": 13000226080, "step": 24800 }, { "epoch": 0.23707025054557163, "grad_norm": 0.14264918863773346, "learning_rate": 0.001, "loss": 2.2482, "num_input_tokens_seen": 13026440480, "step": 24850 }, { "epoch": 0.2375472530617599, "grad_norm": 0.143757626414299, "learning_rate": 0.001, "loss": 2.2512, "num_input_tokens_seen": 13052650624, "step": 24900 }, { "epoch": 0.23802425557794818, "grad_norm": 0.157669335603714, "learning_rate": 0.001, "loss": 2.2597, "num_input_tokens_seen": 13078857312, "step": 24950 }, { "epoch": 0.23850125809413644, "grad_norm": 0.13242298364639282, "learning_rate": 0.001, "loss": 2.253, "num_input_tokens_seen": 13105069824, "step": 25000 }, { "epoch": 0.23850125809413644, "eval_loss": 2.1673171520233154, "eval_runtime": 82.9202, "eval_samples_per_second": 60.299, "eval_steps_per_second": 15.075, "num_input_tokens_seen": 13105069824, "step": 25000 }, { "epoch": 0.23897826061032473, "grad_norm": 0.13656990230083466, "learning_rate": 0.001, "loss": 2.2661, "num_input_tokens_seen": 13131284224, "step": 25050 }, { "epoch": 0.239455263126513, "grad_norm": 0.13822485506534576, "learning_rate": 0.001, "loss": 2.251, "num_input_tokens_seen": 13157497664, "step": 25100 }, { "epoch": 0.23993226564270126, "grad_norm": 0.13563229143619537, "learning_rate": 0.001, "loss": 2.2495, "num_input_tokens_seen": 13183708032, "step": 25150 }, { "epoch": 0.24040926815888955, "grad_norm": 0.1263655722141266, "learning_rate": 0.001, "loss": 2.2478, "num_input_tokens_seen": 13209920512, "step": 25200 }, { "epoch": 0.2408862706750778, "grad_norm": 0.14311367273330688, "learning_rate": 0.001, "loss": 2.2444, "num_input_tokens_seen": 13236127552, "step": 25250 }, { "epoch": 0.24136327319126608, "grad_norm": 0.14571504294872284, "learning_rate": 0.001, "loss": 2.2587, "num_input_tokens_seen": 13262329824, "step": 25300 }, { "epoch": 0.24184027570745437, "grad_norm": 0.16660790145397186, "learning_rate": 0.001, "loss": 2.2554, "num_input_tokens_seen": 13288536736, "step": 25350 }, { "epoch": 0.24231727822364263, "grad_norm": 0.14656688272953033, "learning_rate": 0.001, "loss": 2.2505, "num_input_tokens_seen": 13314751136, "step": 25400 }, { "epoch": 0.2427942807398309, "grad_norm": 0.14772988855838776, "learning_rate": 0.001, "loss": 2.2451, "num_input_tokens_seen": 13340963264, "step": 25450 }, { "epoch": 0.24327128325601918, "grad_norm": 0.15227681398391724, "learning_rate": 0.001, "loss": 2.2388, "num_input_tokens_seen": 13367175456, "step": 25500 }, { "epoch": 0.24327128325601918, "eval_loss": 2.165367364883423, "eval_runtime": 83.0673, "eval_samples_per_second": 60.192, "eval_steps_per_second": 15.048, "num_input_tokens_seen": 13367175456, "step": 25500 }, { "epoch": 0.24374828577220745, "grad_norm": 0.14786367118358612, "learning_rate": 0.001, "loss": 2.2416, "num_input_tokens_seen": 13393384960, "step": 25550 }, { "epoch": 0.2442252882883957, "grad_norm": 0.1325492560863495, "learning_rate": 0.001, "loss": 2.2425, "num_input_tokens_seen": 13419599360, "step": 25600 }, { "epoch": 0.244702290804584, "grad_norm": 0.14455124735832214, "learning_rate": 0.001, "loss": 2.235, "num_input_tokens_seen": 13445800960, "step": 25650 }, { "epoch": 0.24517929332077226, "grad_norm": 0.14452672004699707, "learning_rate": 0.001, "loss": 2.2436, "num_input_tokens_seen": 13472012768, "step": 25700 }, { "epoch": 0.24565629583696055, "grad_norm": 0.13266603648662567, "learning_rate": 0.001, "loss": 2.2406, "num_input_tokens_seen": 13498221376, "step": 25750 }, { "epoch": 0.24613329835314882, "grad_norm": 0.14916899800300598, "learning_rate": 0.001, "loss": 2.2432, "num_input_tokens_seen": 13524435776, "step": 25800 }, { "epoch": 0.24661030086933708, "grad_norm": 0.12612730264663696, "learning_rate": 0.001, "loss": 2.2415, "num_input_tokens_seen": 13550647872, "step": 25850 }, { "epoch": 0.24708730338552537, "grad_norm": 0.1731935441493988, "learning_rate": 0.001, "loss": 2.2427, "num_input_tokens_seen": 13576851328, "step": 25900 }, { "epoch": 0.24756430590171363, "grad_norm": 0.14770366251468658, "learning_rate": 0.001, "loss": 2.2543, "num_input_tokens_seen": 13603058656, "step": 25950 }, { "epoch": 0.2480413084179019, "grad_norm": 0.141856387257576, "learning_rate": 0.001, "loss": 2.2511, "num_input_tokens_seen": 13629260960, "step": 26000 }, { "epoch": 0.2480413084179019, "eval_loss": 2.1629014015197754, "eval_runtime": 82.2057, "eval_samples_per_second": 60.823, "eval_steps_per_second": 15.206, "num_input_tokens_seen": 13629260960, "step": 26000 }, { "epoch": 0.2485183109340902, "grad_norm": 0.15201644599437714, "learning_rate": 0.001, "loss": 2.2435, "num_input_tokens_seen": 13655473920, "step": 26050 }, { "epoch": 0.24899531345027845, "grad_norm": 0.13137054443359375, "learning_rate": 0.001, "loss": 2.2433, "num_input_tokens_seen": 13681688320, "step": 26100 }, { "epoch": 0.2494723159664667, "grad_norm": 0.12851624190807343, "learning_rate": 0.001, "loss": 2.248, "num_input_tokens_seen": 13707897792, "step": 26150 }, { "epoch": 0.249949318482655, "grad_norm": 0.14121095836162567, "learning_rate": 0.001, "loss": 2.2385, "num_input_tokens_seen": 13734107712, "step": 26200 }, { "epoch": 0.25042632099884327, "grad_norm": 0.14826469123363495, "learning_rate": 0.001, "loss": 2.242, "num_input_tokens_seen": 13760320736, "step": 26250 }, { "epoch": 0.25090332351503153, "grad_norm": 0.1528318077325821, "learning_rate": 0.001, "loss": 2.2419, "num_input_tokens_seen": 13786534208, "step": 26300 }, { "epoch": 0.2513803260312198, "grad_norm": 0.15646971762180328, "learning_rate": 0.001, "loss": 2.2407, "num_input_tokens_seen": 13812743072, "step": 26350 }, { "epoch": 0.2518573285474081, "grad_norm": 0.13336172699928284, "learning_rate": 0.001, "loss": 2.2483, "num_input_tokens_seen": 13838957088, "step": 26400 }, { "epoch": 0.2523343310635964, "grad_norm": 0.13618668913841248, "learning_rate": 0.001, "loss": 2.2347, "num_input_tokens_seen": 13865167008, "step": 26450 }, { "epoch": 0.25281133357978464, "grad_norm": 0.14507335424423218, "learning_rate": 0.001, "loss": 2.2453, "num_input_tokens_seen": 13891381408, "step": 26500 }, { "epoch": 0.25281133357978464, "eval_loss": 2.159205675125122, "eval_runtime": 82.4369, "eval_samples_per_second": 60.652, "eval_steps_per_second": 15.163, "num_input_tokens_seen": 13891381408, "step": 26500 }, { "epoch": 0.2532883360959729, "grad_norm": 0.15305021405220032, "learning_rate": 0.001, "loss": 2.2366, "num_input_tokens_seen": 13917593408, "step": 26550 }, { "epoch": 0.25376533861216116, "grad_norm": 0.1411104053258896, "learning_rate": 0.001, "loss": 2.237, "num_input_tokens_seen": 13943805696, "step": 26600 }, { "epoch": 0.2542423411283494, "grad_norm": 0.13980819284915924, "learning_rate": 0.001, "loss": 2.2511, "num_input_tokens_seen": 13970020096, "step": 26650 }, { "epoch": 0.25471934364453774, "grad_norm": 0.14256146550178528, "learning_rate": 0.001, "loss": 2.233, "num_input_tokens_seen": 13996232832, "step": 26700 }, { "epoch": 0.255196346160726, "grad_norm": 0.1522730439901352, "learning_rate": 0.001, "loss": 2.2433, "num_input_tokens_seen": 14022447232, "step": 26750 }, { "epoch": 0.25567334867691427, "grad_norm": 0.14082644879817963, "learning_rate": 0.001, "loss": 2.2481, "num_input_tokens_seen": 14048655424, "step": 26800 }, { "epoch": 0.25615035119310253, "grad_norm": 0.14439330995082855, "learning_rate": 0.001, "loss": 2.2418, "num_input_tokens_seen": 14074868960, "step": 26850 }, { "epoch": 0.2566273537092908, "grad_norm": 0.15122254192829132, "learning_rate": 0.001, "loss": 2.2237, "num_input_tokens_seen": 14101083360, "step": 26900 }, { "epoch": 0.2571043562254791, "grad_norm": 0.14002810418605804, "learning_rate": 0.001, "loss": 2.2446, "num_input_tokens_seen": 14127297760, "step": 26950 }, { "epoch": 0.2575813587416674, "grad_norm": 0.135335773229599, "learning_rate": 0.001, "loss": 2.2302, "num_input_tokens_seen": 14153506688, "step": 27000 }, { "epoch": 0.2575813587416674, "eval_loss": 2.1568057537078857, "eval_runtime": 82.6565, "eval_samples_per_second": 60.491, "eval_steps_per_second": 15.123, "num_input_tokens_seen": 14153506688, "step": 27000 }, { "epoch": 0.25805836125785564, "grad_norm": 0.1621844619512558, "learning_rate": 0.001, "loss": 2.2378, "num_input_tokens_seen": 14179709120, "step": 27050 }, { "epoch": 0.2585353637740439, "grad_norm": 0.1400565207004547, "learning_rate": 0.001, "loss": 2.2416, "num_input_tokens_seen": 14205917376, "step": 27100 }, { "epoch": 0.25901236629023217, "grad_norm": 0.1439099758863449, "learning_rate": 0.001, "loss": 2.2443, "num_input_tokens_seen": 14232130048, "step": 27150 }, { "epoch": 0.25948936880642043, "grad_norm": 0.1417345255613327, "learning_rate": 0.001, "loss": 2.2416, "num_input_tokens_seen": 14258342176, "step": 27200 }, { "epoch": 0.25996637132260875, "grad_norm": 0.14503255486488342, "learning_rate": 0.001, "loss": 2.2363, "num_input_tokens_seen": 14284552192, "step": 27250 }, { "epoch": 0.260443373838797, "grad_norm": 0.14273668825626373, "learning_rate": 0.001, "loss": 2.2321, "num_input_tokens_seen": 14310756928, "step": 27300 }, { "epoch": 0.2609203763549853, "grad_norm": 0.14368127286434174, "learning_rate": 0.001, "loss": 2.2442, "num_input_tokens_seen": 14336966336, "step": 27350 }, { "epoch": 0.26139737887117354, "grad_norm": 0.13848727941513062, "learning_rate": 0.001, "loss": 2.2256, "num_input_tokens_seen": 14363168832, "step": 27400 }, { "epoch": 0.2618743813873618, "grad_norm": 0.13322456181049347, "learning_rate": 0.001, "loss": 2.2422, "num_input_tokens_seen": 14389376864, "step": 27450 }, { "epoch": 0.2623513839035501, "grad_norm": 0.14387381076812744, "learning_rate": 0.001, "loss": 2.2305, "num_input_tokens_seen": 14415584288, "step": 27500 }, { "epoch": 0.2623513839035501, "eval_loss": 2.1539554595947266, "eval_runtime": 82.2364, "eval_samples_per_second": 60.8, "eval_steps_per_second": 15.2, "num_input_tokens_seen": 14415584288, "step": 27500 }, { "epoch": 0.2628283864197384, "grad_norm": 0.1330934762954712, "learning_rate": 0.001, "loss": 2.2403, "num_input_tokens_seen": 14441793888, "step": 27550 }, { "epoch": 0.26330538893592664, "grad_norm": 0.1436392068862915, "learning_rate": 0.001, "loss": 2.2378, "num_input_tokens_seen": 14468003360, "step": 27600 }, { "epoch": 0.2637823914521149, "grad_norm": 0.14959146082401276, "learning_rate": 0.001, "loss": 2.2322, "num_input_tokens_seen": 14494214688, "step": 27650 }, { "epoch": 0.26425939396830317, "grad_norm": 0.13732261955738068, "learning_rate": 0.001, "loss": 2.2409, "num_input_tokens_seen": 14520428096, "step": 27700 }, { "epoch": 0.26473639648449143, "grad_norm": 0.14365635812282562, "learning_rate": 0.001, "loss": 2.2309, "num_input_tokens_seen": 14546632320, "step": 27750 }, { "epoch": 0.26521339900067975, "grad_norm": 0.14219102263450623, "learning_rate": 0.001, "loss": 2.2301, "num_input_tokens_seen": 14572845824, "step": 27800 }, { "epoch": 0.265690401516868, "grad_norm": 0.12978766858577728, "learning_rate": 0.001, "loss": 2.2338, "num_input_tokens_seen": 14599059104, "step": 27850 }, { "epoch": 0.2661674040330563, "grad_norm": 0.14842823147773743, "learning_rate": 0.001, "loss": 2.2341, "num_input_tokens_seen": 14625272352, "step": 27900 }, { "epoch": 0.26664440654924454, "grad_norm": 0.15755997598171234, "learning_rate": 0.001, "loss": 2.2346, "num_input_tokens_seen": 14651486752, "step": 27950 }, { "epoch": 0.2671214090654328, "grad_norm": 0.15303048491477966, "learning_rate": 0.001, "loss": 2.2285, "num_input_tokens_seen": 14677696896, "step": 28000 }, { "epoch": 0.2671214090654328, "eval_loss": 2.1511332988739014, "eval_runtime": 82.4707, "eval_samples_per_second": 60.628, "eval_steps_per_second": 15.157, "num_input_tokens_seen": 14677696896, "step": 28000 }, { "epoch": 0.2675984115816211, "grad_norm": 0.14884154498577118, "learning_rate": 0.001, "loss": 2.2273, "num_input_tokens_seen": 14703910880, "step": 28050 }, { "epoch": 0.2680754140978094, "grad_norm": 0.15423212945461273, "learning_rate": 0.001, "loss": 2.2328, "num_input_tokens_seen": 14730125280, "step": 28100 }, { "epoch": 0.26855241661399765, "grad_norm": 0.14974601566791534, "learning_rate": 0.001, "loss": 2.2282, "num_input_tokens_seen": 14756332672, "step": 28150 }, { "epoch": 0.2690294191301859, "grad_norm": 0.13820908963680267, "learning_rate": 0.001, "loss": 2.2281, "num_input_tokens_seen": 14782539808, "step": 28200 }, { "epoch": 0.2695064216463742, "grad_norm": 0.142080619931221, "learning_rate": 0.001, "loss": 2.2375, "num_input_tokens_seen": 14808750880, "step": 28250 }, { "epoch": 0.26998342416256244, "grad_norm": 0.12440051883459091, "learning_rate": 0.001, "loss": 2.2495, "num_input_tokens_seen": 14834952704, "step": 28300 }, { "epoch": 0.27046042667875075, "grad_norm": 0.1336458921432495, "learning_rate": 0.001, "loss": 2.2379, "num_input_tokens_seen": 14861154080, "step": 28350 }, { "epoch": 0.270937429194939, "grad_norm": 0.13392235338687897, "learning_rate": 0.001, "loss": 2.2309, "num_input_tokens_seen": 14887355328, "step": 28400 }, { "epoch": 0.2714144317111273, "grad_norm": 0.14661704003810883, "learning_rate": 0.001, "loss": 2.2339, "num_input_tokens_seen": 14913569312, "step": 28450 }, { "epoch": 0.27189143422731554, "grad_norm": 0.14747242629528046, "learning_rate": 0.001, "loss": 2.23, "num_input_tokens_seen": 14939780320, "step": 28500 }, { "epoch": 0.27189143422731554, "eval_loss": 2.1497859954833984, "eval_runtime": 82.2811, "eval_samples_per_second": 60.767, "eval_steps_per_second": 15.192, "num_input_tokens_seen": 14939780320, "step": 28500 }, { "epoch": 0.2723684367435038, "grad_norm": 0.14154289662837982, "learning_rate": 0.001, "loss": 2.2335, "num_input_tokens_seen": 14965993216, "step": 28550 }, { "epoch": 0.27284543925969207, "grad_norm": 0.13375169038772583, "learning_rate": 0.001, "loss": 2.2336, "num_input_tokens_seen": 14992207616, "step": 28600 }, { "epoch": 0.2733224417758804, "grad_norm": 0.13693657517433167, "learning_rate": 0.001, "loss": 2.2406, "num_input_tokens_seen": 15018421472, "step": 28650 }, { "epoch": 0.27379944429206865, "grad_norm": 0.14101336896419525, "learning_rate": 0.001, "loss": 2.226, "num_input_tokens_seen": 15044631584, "step": 28700 }, { "epoch": 0.2742764468082569, "grad_norm": 0.1408209353685379, "learning_rate": 0.001, "loss": 2.2241, "num_input_tokens_seen": 15070842272, "step": 28750 }, { "epoch": 0.2747534493244452, "grad_norm": 0.13644535839557648, "learning_rate": 0.001, "loss": 2.234, "num_input_tokens_seen": 15097056032, "step": 28800 }, { "epoch": 0.27523045184063344, "grad_norm": 0.15270450711250305, "learning_rate": 0.001, "loss": 2.2408, "num_input_tokens_seen": 15123268096, "step": 28850 }, { "epoch": 0.27570745435682176, "grad_norm": 0.1514887809753418, "learning_rate": 0.001, "loss": 2.2306, "num_input_tokens_seen": 15149474304, "step": 28900 }, { "epoch": 0.27618445687301, "grad_norm": 0.13505037128925323, "learning_rate": 0.001, "loss": 2.2321, "num_input_tokens_seen": 15175685536, "step": 28950 }, { "epoch": 0.2766614593891983, "grad_norm": 0.1479782909154892, "learning_rate": 0.001, "loss": 2.2136, "num_input_tokens_seen": 15201894176, "step": 29000 }, { "epoch": 0.2766614593891983, "eval_loss": 2.147918701171875, "eval_runtime": 82.3338, "eval_samples_per_second": 60.728, "eval_steps_per_second": 15.182, "num_input_tokens_seen": 15201894176, "step": 29000 }, { "epoch": 0.27713846190538655, "grad_norm": 0.15296803414821625, "learning_rate": 0.001, "loss": 2.2364, "num_input_tokens_seen": 15228104928, "step": 29050 }, { "epoch": 0.2776154644215748, "grad_norm": 0.13571250438690186, "learning_rate": 0.001, "loss": 2.2435, "num_input_tokens_seen": 15254317056, "step": 29100 }, { "epoch": 0.27809246693776307, "grad_norm": 0.13723242282867432, "learning_rate": 0.001, "loss": 2.229, "num_input_tokens_seen": 15280525888, "step": 29150 }, { "epoch": 0.2785694694539514, "grad_norm": 0.14391474425792694, "learning_rate": 0.001, "loss": 2.2304, "num_input_tokens_seen": 15306733696, "step": 29200 }, { "epoch": 0.27904647197013965, "grad_norm": 0.14517651498317719, "learning_rate": 0.001, "loss": 2.2281, "num_input_tokens_seen": 15332940096, "step": 29250 }, { "epoch": 0.2795234744863279, "grad_norm": 0.15248551964759827, "learning_rate": 0.001, "loss": 2.2329, "num_input_tokens_seen": 15359147872, "step": 29300 }, { "epoch": 0.2800004770025162, "grad_norm": 0.13358819484710693, "learning_rate": 0.001, "loss": 2.2321, "num_input_tokens_seen": 15385357856, "step": 29350 }, { "epoch": 0.28047747951870444, "grad_norm": 0.13603374361991882, "learning_rate": 0.001, "loss": 2.2251, "num_input_tokens_seen": 15411568224, "step": 29400 }, { "epoch": 0.28095448203489276, "grad_norm": 0.13729465007781982, "learning_rate": 0.001, "loss": 2.2238, "num_input_tokens_seen": 15437777056, "step": 29450 }, { "epoch": 0.281431484551081, "grad_norm": 0.14745451509952545, "learning_rate": 0.001, "loss": 2.2333, "num_input_tokens_seen": 15463988928, "step": 29500 }, { "epoch": 0.281431484551081, "eval_loss": 2.1445696353912354, "eval_runtime": 82.7914, "eval_samples_per_second": 60.393, "eval_steps_per_second": 15.098, "num_input_tokens_seen": 15463988928, "step": 29500 }, { "epoch": 0.2819084870672693, "grad_norm": 0.1480429619550705, "learning_rate": 0.001, "loss": 2.2247, "num_input_tokens_seen": 15490198688, "step": 29550 }, { "epoch": 0.28238548958345755, "grad_norm": 0.1438407003879547, "learning_rate": 0.001, "loss": 2.2394, "num_input_tokens_seen": 15516405824, "step": 29600 }, { "epoch": 0.2828624920996458, "grad_norm": 0.14721985161304474, "learning_rate": 0.001, "loss": 2.2281, "num_input_tokens_seen": 15542610624, "step": 29650 }, { "epoch": 0.2833394946158341, "grad_norm": 0.17293605208396912, "learning_rate": 0.001, "loss": 2.2306, "num_input_tokens_seen": 15568821408, "step": 29700 }, { "epoch": 0.2838164971320224, "grad_norm": 0.14340583980083466, "learning_rate": 0.001, "loss": 2.2249, "num_input_tokens_seen": 15595031840, "step": 29750 }, { "epoch": 0.28429349964821066, "grad_norm": 0.14480094611644745, "learning_rate": 0.001, "loss": 2.2238, "num_input_tokens_seen": 15621240512, "step": 29800 }, { "epoch": 0.2847705021643989, "grad_norm": 0.13383765518665314, "learning_rate": 0.001, "loss": 2.2129, "num_input_tokens_seen": 15647444192, "step": 29850 }, { "epoch": 0.2852475046805872, "grad_norm": 0.1253250688314438, "learning_rate": 0.001, "loss": 2.2237, "num_input_tokens_seen": 15673657408, "step": 29900 }, { "epoch": 0.28572450719677545, "grad_norm": 0.14244495332241058, "learning_rate": 0.001, "loss": 2.2189, "num_input_tokens_seen": 15699867008, "step": 29950 }, { "epoch": 0.2862015097129637, "grad_norm": 0.14013972878456116, "learning_rate": 0.001, "loss": 2.2241, "num_input_tokens_seen": 15726072896, "step": 30000 }, { "epoch": 0.2862015097129637, "eval_loss": 2.142565965652466, "eval_runtime": 82.4636, "eval_samples_per_second": 60.633, "eval_steps_per_second": 15.158, "num_input_tokens_seen": 15726072896, "step": 30000 }, { "epoch": 0.286678512229152, "grad_norm": 0.1321924477815628, "learning_rate": 0.001, "loss": 2.2292, "num_input_tokens_seen": 15752287296, "step": 30050 }, { "epoch": 0.2871555147453403, "grad_norm": 0.13304558396339417, "learning_rate": 0.001, "loss": 2.2254, "num_input_tokens_seen": 15778491232, "step": 30100 }, { "epoch": 0.28763251726152855, "grad_norm": 0.146531879901886, "learning_rate": 0.001, "loss": 2.231, "num_input_tokens_seen": 15804705632, "step": 30150 }, { "epoch": 0.2881095197777168, "grad_norm": 0.14188075065612793, "learning_rate": 0.001, "loss": 2.2163, "num_input_tokens_seen": 15830915584, "step": 30200 }, { "epoch": 0.2885865222939051, "grad_norm": 0.1407197266817093, "learning_rate": 0.001, "loss": 2.2344, "num_input_tokens_seen": 15857128832, "step": 30250 }, { "epoch": 0.2890635248100934, "grad_norm": 0.137710839509964, "learning_rate": 0.001, "loss": 2.2233, "num_input_tokens_seen": 15883341600, "step": 30300 }, { "epoch": 0.28954052732628166, "grad_norm": 0.15242904424667358, "learning_rate": 0.001, "loss": 2.2306, "num_input_tokens_seen": 15909551680, "step": 30350 }, { "epoch": 0.2900175298424699, "grad_norm": 0.1370503157377243, "learning_rate": 0.001, "loss": 2.2259, "num_input_tokens_seen": 15935762624, "step": 30400 }, { "epoch": 0.2904945323586582, "grad_norm": 0.14922258257865906, "learning_rate": 0.001, "loss": 2.2201, "num_input_tokens_seen": 15961972960, "step": 30450 }, { "epoch": 0.29097153487484645, "grad_norm": 0.15227100253105164, "learning_rate": 0.001, "loss": 2.2318, "num_input_tokens_seen": 15988180544, "step": 30500 }, { "epoch": 0.29097153487484645, "eval_loss": 2.1403589248657227, "eval_runtime": 82.3532, "eval_samples_per_second": 60.714, "eval_steps_per_second": 15.179, "num_input_tokens_seen": 15988180544, "step": 30500 }, { "epoch": 0.2914485373910347, "grad_norm": 0.14497631788253784, "learning_rate": 0.001, "loss": 2.2362, "num_input_tokens_seen": 16014385184, "step": 30550 }, { "epoch": 0.29192553990722303, "grad_norm": 0.16184133291244507, "learning_rate": 0.001, "loss": 2.2189, "num_input_tokens_seen": 16040596480, "step": 30600 }, { "epoch": 0.2924025424234113, "grad_norm": 0.1632627546787262, "learning_rate": 0.001, "loss": 2.2354, "num_input_tokens_seen": 16066805632, "step": 30650 }, { "epoch": 0.29287954493959956, "grad_norm": 0.13771861791610718, "learning_rate": 0.001, "loss": 2.2261, "num_input_tokens_seen": 16093020000, "step": 30700 }, { "epoch": 0.2933565474557878, "grad_norm": 0.13185714185237885, "learning_rate": 0.001, "loss": 2.2269, "num_input_tokens_seen": 16119232416, "step": 30750 }, { "epoch": 0.2938335499719761, "grad_norm": 0.1477263867855072, "learning_rate": 0.001, "loss": 2.2512, "num_input_tokens_seen": 16145442176, "step": 30800 }, { "epoch": 0.2943105524881644, "grad_norm": 0.13301660120487213, "learning_rate": 0.001, "loss": 2.2361, "num_input_tokens_seen": 16171653888, "step": 30850 }, { "epoch": 0.29478755500435266, "grad_norm": 0.1365855634212494, "learning_rate": 0.001, "loss": 2.2314, "num_input_tokens_seen": 16197866368, "step": 30900 }, { "epoch": 0.2952645575205409, "grad_norm": 0.13683389127254486, "learning_rate": 0.001, "loss": 2.2174, "num_input_tokens_seen": 16224078272, "step": 30950 }, { "epoch": 0.2957415600367292, "grad_norm": 0.14148303866386414, "learning_rate": 0.001, "loss": 2.2156, "num_input_tokens_seen": 16250292672, "step": 31000 }, { "epoch": 0.2957415600367292, "eval_loss": 2.1403136253356934, "eval_runtime": 82.1466, "eval_samples_per_second": 60.867, "eval_steps_per_second": 15.217, "num_input_tokens_seen": 16250292672, "step": 31000 }, { "epoch": 0.29621856255291745, "grad_norm": 0.13794481754302979, "learning_rate": 0.001, "loss": 2.2231, "num_input_tokens_seen": 16276495104, "step": 31050 }, { "epoch": 0.2966955650691057, "grad_norm": 0.13724444806575775, "learning_rate": 0.001, "loss": 2.2262, "num_input_tokens_seen": 16302705760, "step": 31100 }, { "epoch": 0.29717256758529403, "grad_norm": 0.14742279052734375, "learning_rate": 0.001, "loss": 2.2117, "num_input_tokens_seen": 16328920160, "step": 31150 }, { "epoch": 0.2976495701014823, "grad_norm": 0.15125079452991486, "learning_rate": 0.001, "loss": 2.23, "num_input_tokens_seen": 16355134560, "step": 31200 }, { "epoch": 0.29812657261767056, "grad_norm": 0.13968023657798767, "learning_rate": 0.001, "loss": 2.2199, "num_input_tokens_seen": 16381347072, "step": 31250 }, { "epoch": 0.2986035751338588, "grad_norm": 0.1456058770418167, "learning_rate": 0.001, "loss": 2.2239, "num_input_tokens_seen": 16407551584, "step": 31300 }, { "epoch": 0.2990805776500471, "grad_norm": 0.1414702981710434, "learning_rate": 0.001, "loss": 2.2312, "num_input_tokens_seen": 16433761632, "step": 31350 }, { "epoch": 0.2995575801662354, "grad_norm": 0.14494173228740692, "learning_rate": 0.001, "loss": 2.2159, "num_input_tokens_seen": 16459971584, "step": 31400 }, { "epoch": 0.30003458268242367, "grad_norm": 0.1383238434791565, "learning_rate": 0.001, "loss": 2.2192, "num_input_tokens_seen": 16486183552, "step": 31450 }, { "epoch": 0.30051158519861193, "grad_norm": 0.1489211916923523, "learning_rate": 0.001, "loss": 2.2184, "num_input_tokens_seen": 16512390176, "step": 31500 }, { "epoch": 0.30051158519861193, "eval_loss": 2.137352466583252, "eval_runtime": 82.2794, "eval_samples_per_second": 60.769, "eval_steps_per_second": 15.192, "num_input_tokens_seen": 16512390176, "step": 31500 }, { "epoch": 0.3009885877148002, "grad_norm": 0.13963983952999115, "learning_rate": 0.001, "loss": 2.2169, "num_input_tokens_seen": 16538590176, "step": 31550 }, { "epoch": 0.30146559023098846, "grad_norm": 0.1359083205461502, "learning_rate": 0.001, "loss": 2.2186, "num_input_tokens_seen": 16564803488, "step": 31600 }, { "epoch": 0.3019425927471767, "grad_norm": 0.14997832477092743, "learning_rate": 0.001, "loss": 2.2268, "num_input_tokens_seen": 16591011264, "step": 31650 }, { "epoch": 0.30241959526336504, "grad_norm": 0.15025894343852997, "learning_rate": 0.001, "loss": 2.2057, "num_input_tokens_seen": 16617204384, "step": 31700 }, { "epoch": 0.3028965977795533, "grad_norm": 0.15080207586288452, "learning_rate": 0.001, "loss": 2.2183, "num_input_tokens_seen": 16643416384, "step": 31750 }, { "epoch": 0.30337360029574156, "grad_norm": 0.14180131256580353, "learning_rate": 0.001, "loss": 2.2197, "num_input_tokens_seen": 16669626720, "step": 31800 }, { "epoch": 0.3038506028119298, "grad_norm": 0.13369297981262207, "learning_rate": 0.001, "loss": 2.2204, "num_input_tokens_seen": 16695839904, "step": 31850 }, { "epoch": 0.3043276053281181, "grad_norm": 0.12933358550071716, "learning_rate": 0.001, "loss": 2.2146, "num_input_tokens_seen": 16722049312, "step": 31900 }, { "epoch": 0.30480460784430635, "grad_norm": 0.14158771932125092, "learning_rate": 0.001, "loss": 2.2189, "num_input_tokens_seen": 16748263712, "step": 31950 }, { "epoch": 0.30528161036049467, "grad_norm": 0.13659563660621643, "learning_rate": 0.001, "loss": 2.2261, "num_input_tokens_seen": 16774478112, "step": 32000 }, { "epoch": 0.30528161036049467, "eval_loss": 2.134640693664551, "eval_runtime": 82.2448, "eval_samples_per_second": 60.794, "eval_steps_per_second": 15.199, "num_input_tokens_seen": 16774478112, "step": 32000 }, { "epoch": 0.30575861287668293, "grad_norm": 0.1510363072156906, "learning_rate": 0.001, "loss": 2.2115, "num_input_tokens_seen": 16800686624, "step": 32050 }, { "epoch": 0.3062356153928712, "grad_norm": 0.13621051609516144, "learning_rate": 0.001, "loss": 2.2137, "num_input_tokens_seen": 16826897024, "step": 32100 }, { "epoch": 0.30671261790905946, "grad_norm": 0.15500463545322418, "learning_rate": 0.001, "loss": 2.2202, "num_input_tokens_seen": 16853103904, "step": 32150 }, { "epoch": 0.3071896204252477, "grad_norm": 0.17024828493595123, "learning_rate": 0.001, "loss": 2.2153, "num_input_tokens_seen": 16879308320, "step": 32200 }, { "epoch": 0.30766662294143604, "grad_norm": 0.14913325011730194, "learning_rate": 0.001, "loss": 2.2106, "num_input_tokens_seen": 16905522592, "step": 32250 }, { "epoch": 0.3081436254576243, "grad_norm": 0.1486227959394455, "learning_rate": 0.001, "loss": 2.2141, "num_input_tokens_seen": 16931727424, "step": 32300 }, { "epoch": 0.30862062797381257, "grad_norm": 0.1393032670021057, "learning_rate": 0.001, "loss": 2.2127, "num_input_tokens_seen": 16957941504, "step": 32350 }, { "epoch": 0.30909763049000083, "grad_norm": 0.13792607188224792, "learning_rate": 0.001, "loss": 2.2021, "num_input_tokens_seen": 16984153952, "step": 32400 }, { "epoch": 0.3095746330061891, "grad_norm": 0.14202407002449036, "learning_rate": 0.001, "loss": 2.2192, "num_input_tokens_seen": 17010368352, "step": 32450 }, { "epoch": 0.31005163552237736, "grad_norm": 0.15250712633132935, "learning_rate": 0.001, "loss": 2.2091, "num_input_tokens_seen": 17036582752, "step": 32500 }, { "epoch": 0.31005163552237736, "eval_loss": 2.1334831714630127, "eval_runtime": 82.4889, "eval_samples_per_second": 60.614, "eval_steps_per_second": 15.154, "num_input_tokens_seen": 17036582752, "step": 32500 }, { "epoch": 0.3105286380385657, "grad_norm": 0.15931129455566406, "learning_rate": 0.001, "loss": 2.2171, "num_input_tokens_seen": 17062797152, "step": 32550 }, { "epoch": 0.31100564055475394, "grad_norm": 0.1495935022830963, "learning_rate": 0.001, "loss": 2.2178, "num_input_tokens_seen": 17089009728, "step": 32600 }, { "epoch": 0.3114826430709422, "grad_norm": 0.1444777250289917, "learning_rate": 0.001, "loss": 2.2206, "num_input_tokens_seen": 17115219360, "step": 32650 }, { "epoch": 0.31195964558713046, "grad_norm": 0.13968896865844727, "learning_rate": 0.001, "loss": 2.213, "num_input_tokens_seen": 17141432480, "step": 32700 }, { "epoch": 0.3124366481033187, "grad_norm": 0.14426162838935852, "learning_rate": 0.001, "loss": 2.2091, "num_input_tokens_seen": 17167643776, "step": 32750 }, { "epoch": 0.31291365061950704, "grad_norm": 0.15707091987133026, "learning_rate": 0.001, "loss": 2.219, "num_input_tokens_seen": 17193858176, "step": 32800 }, { "epoch": 0.3133906531356953, "grad_norm": 0.14893439412117004, "learning_rate": 0.001, "loss": 2.2213, "num_input_tokens_seen": 17220072576, "step": 32850 }, { "epoch": 0.31386765565188357, "grad_norm": 0.15472280979156494, "learning_rate": 0.001, "loss": 2.2195, "num_input_tokens_seen": 17246283712, "step": 32900 }, { "epoch": 0.31434465816807183, "grad_norm": 0.13622242212295532, "learning_rate": 0.001, "loss": 2.2008, "num_input_tokens_seen": 17272492064, "step": 32950 }, { "epoch": 0.3148216606842601, "grad_norm": 0.14335715770721436, "learning_rate": 0.001, "loss": 2.2222, "num_input_tokens_seen": 17298706464, "step": 33000 }, { "epoch": 0.3148216606842601, "eval_loss": 2.130917549133301, "eval_runtime": 81.5143, "eval_samples_per_second": 61.339, "eval_steps_per_second": 15.335, "num_input_tokens_seen": 17298706464, "step": 33000 }, { "epoch": 0.31529866320044836, "grad_norm": 0.138872429728508, "learning_rate": 0.001, "loss": 2.2158, "num_input_tokens_seen": 17324910368, "step": 33050 }, { "epoch": 0.3157756657166367, "grad_norm": 0.1603110432624817, "learning_rate": 0.001, "loss": 2.2118, "num_input_tokens_seen": 17351124768, "step": 33100 }, { "epoch": 0.31625266823282494, "grad_norm": 0.15042147040367126, "learning_rate": 0.001, "loss": 2.218, "num_input_tokens_seen": 17377337600, "step": 33150 }, { "epoch": 0.3167296707490132, "grad_norm": 0.13590936362743378, "learning_rate": 0.001, "loss": 2.2239, "num_input_tokens_seen": 17403547008, "step": 33200 }, { "epoch": 0.31720667326520147, "grad_norm": 0.14794215559959412, "learning_rate": 0.001, "loss": 2.2039, "num_input_tokens_seen": 17429759456, "step": 33250 }, { "epoch": 0.31768367578138973, "grad_norm": 0.15067002177238464, "learning_rate": 0.001, "loss": 2.2089, "num_input_tokens_seen": 17455966240, "step": 33300 }, { "epoch": 0.318160678297578, "grad_norm": 0.13929149508476257, "learning_rate": 0.001, "loss": 2.2056, "num_input_tokens_seen": 17482177952, "step": 33350 }, { "epoch": 0.3186376808137663, "grad_norm": 0.13707947731018066, "learning_rate": 0.001, "loss": 2.2194, "num_input_tokens_seen": 17508390848, "step": 33400 }, { "epoch": 0.3191146833299546, "grad_norm": 0.13600605726242065, "learning_rate": 0.001, "loss": 2.2219, "num_input_tokens_seen": 17534602528, "step": 33450 }, { "epoch": 0.31959168584614284, "grad_norm": 0.16074150800704956, "learning_rate": 0.001, "loss": 2.2172, "num_input_tokens_seen": 17560813216, "step": 33500 }, { "epoch": 0.31959168584614284, "eval_loss": 2.1288042068481445, "eval_runtime": 82.01, "eval_samples_per_second": 60.968, "eval_steps_per_second": 15.242, "num_input_tokens_seen": 17560813216, "step": 33500 }, { "epoch": 0.3200686883623311, "grad_norm": 0.1537347286939621, "learning_rate": 0.001, "loss": 2.2128, "num_input_tokens_seen": 17587027616, "step": 33550 }, { "epoch": 0.32054569087851936, "grad_norm": 0.15225750207901, "learning_rate": 0.001, "loss": 2.2182, "num_input_tokens_seen": 17613240704, "step": 33600 }, { "epoch": 0.3210226933947077, "grad_norm": 0.17453214526176453, "learning_rate": 0.001, "loss": 2.2089, "num_input_tokens_seen": 17639452224, "step": 33650 }, { "epoch": 0.32149969591089594, "grad_norm": 0.13869380950927734, "learning_rate": 0.001, "loss": 2.2164, "num_input_tokens_seen": 17665664864, "step": 33700 }, { "epoch": 0.3219766984270842, "grad_norm": 0.1399545818567276, "learning_rate": 0.001, "loss": 2.2066, "num_input_tokens_seen": 17691877472, "step": 33750 }, { "epoch": 0.32245370094327247, "grad_norm": 0.15858028829097748, "learning_rate": 0.001, "loss": 2.2062, "num_input_tokens_seen": 17718076992, "step": 33800 }, { "epoch": 0.32293070345946073, "grad_norm": 0.14098668098449707, "learning_rate": 0.001, "loss": 2.2124, "num_input_tokens_seen": 17744290080, "step": 33850 }, { "epoch": 0.323407705975649, "grad_norm": 0.15374623239040375, "learning_rate": 0.001, "loss": 2.2017, "num_input_tokens_seen": 17770502944, "step": 33900 }, { "epoch": 0.3238847084918373, "grad_norm": 0.15962567925453186, "learning_rate": 0.001, "loss": 2.2095, "num_input_tokens_seen": 17796710240, "step": 33950 }, { "epoch": 0.3243617110080256, "grad_norm": 0.14005784690380096, "learning_rate": 0.001, "loss": 2.2003, "num_input_tokens_seen": 17822921152, "step": 34000 }, { "epoch": 0.3243617110080256, "eval_loss": 2.127488613128662, "eval_runtime": 82.0715, "eval_samples_per_second": 60.923, "eval_steps_per_second": 15.231, "num_input_tokens_seen": 17822921152, "step": 34000 }, { "epoch": 0.32483871352421384, "grad_norm": 0.15280354022979736, "learning_rate": 0.001, "loss": 2.2068, "num_input_tokens_seen": 17849135552, "step": 34050 }, { "epoch": 0.3253157160404021, "grad_norm": 0.13673514127731323, "learning_rate": 0.001, "loss": 2.1972, "num_input_tokens_seen": 17875349632, "step": 34100 }, { "epoch": 0.32579271855659037, "grad_norm": 0.13414499163627625, "learning_rate": 0.001, "loss": 2.2037, "num_input_tokens_seen": 17901564032, "step": 34150 }, { "epoch": 0.3262697210727787, "grad_norm": 0.16514766216278076, "learning_rate": 0.001, "loss": 2.2569, "num_input_tokens_seen": 17927772512, "step": 34200 }, { "epoch": 0.32674672358896695, "grad_norm": 0.14111830294132233, "learning_rate": 0.001, "loss": 2.2395, "num_input_tokens_seen": 17953984416, "step": 34250 }, { "epoch": 0.3272237261051552, "grad_norm": 0.1439828723669052, "learning_rate": 0.001, "loss": 2.2155, "num_input_tokens_seen": 17980194464, "step": 34300 }, { "epoch": 0.3277007286213435, "grad_norm": 0.13390502333641052, "learning_rate": 0.001, "loss": 2.2226, "num_input_tokens_seen": 18006404832, "step": 34350 }, { "epoch": 0.32817773113753174, "grad_norm": 0.1647501289844513, "learning_rate": 0.001, "loss": 2.2113, "num_input_tokens_seen": 18032606304, "step": 34400 }, { "epoch": 0.32865473365372, "grad_norm": 0.14238382875919342, "learning_rate": 0.001, "loss": 2.219, "num_input_tokens_seen": 18058813856, "step": 34450 }, { "epoch": 0.3291317361699083, "grad_norm": 0.13778063654899597, "learning_rate": 0.001, "loss": 2.2152, "num_input_tokens_seen": 18085018528, "step": 34500 }, { "epoch": 0.3291317361699083, "eval_loss": 2.1281445026397705, "eval_runtime": 81.5943, "eval_samples_per_second": 61.279, "eval_steps_per_second": 15.32, "num_input_tokens_seen": 18085018528, "step": 34500 }, { "epoch": 0.3296087386860966, "grad_norm": 0.144911527633667, "learning_rate": 0.001, "loss": 2.2176, "num_input_tokens_seen": 18111225184, "step": 34550 }, { "epoch": 0.33008574120228484, "grad_norm": 0.1375516951084137, "learning_rate": 0.001, "loss": 2.2034, "num_input_tokens_seen": 18137439584, "step": 34600 }, { "epoch": 0.3305627437184731, "grad_norm": 0.1483387053012848, "learning_rate": 0.001, "loss": 2.2139, "num_input_tokens_seen": 18163649536, "step": 34650 }, { "epoch": 0.33103974623466137, "grad_norm": 0.13756632804870605, "learning_rate": 0.001, "loss": 2.2142, "num_input_tokens_seen": 18189863936, "step": 34700 }, { "epoch": 0.3315167487508497, "grad_norm": 0.15277694165706635, "learning_rate": 0.001, "loss": 2.2, "num_input_tokens_seen": 18216078336, "step": 34750 }, { "epoch": 0.33199375126703795, "grad_norm": 0.1554708331823349, "learning_rate": 0.001, "loss": 2.1976, "num_input_tokens_seen": 18242291840, "step": 34800 }, { "epoch": 0.3324707537832262, "grad_norm": 0.15377846360206604, "learning_rate": 0.001, "loss": 2.2152, "num_input_tokens_seen": 18268506240, "step": 34850 }, { "epoch": 0.3329477562994145, "grad_norm": 0.14748121798038483, "learning_rate": 0.001, "loss": 2.2081, "num_input_tokens_seen": 18294720640, "step": 34900 }, { "epoch": 0.33342475881560274, "grad_norm": 0.13819515705108643, "learning_rate": 0.001, "loss": 2.2003, "num_input_tokens_seen": 18320932768, "step": 34950 }, { "epoch": 0.333901761331791, "grad_norm": 0.15223775804042816, "learning_rate": 0.001, "loss": 2.2076, "num_input_tokens_seen": 18347147168, "step": 35000 }, { "epoch": 0.333901761331791, "eval_loss": 2.1236226558685303, "eval_runtime": 82.2565, "eval_samples_per_second": 60.785, "eval_steps_per_second": 15.196, "num_input_tokens_seen": 18347147168, "step": 35000 }, { "epoch": 0.3343787638479793, "grad_norm": 0.2939516603946686, "learning_rate": 0.001, "loss": 2.1954, "num_input_tokens_seen": 18373355744, "step": 35050 }, { "epoch": 0.3348557663641676, "grad_norm": 0.1514858454465866, "learning_rate": 0.001, "loss": 2.1988, "num_input_tokens_seen": 18399565696, "step": 35100 }, { "epoch": 0.33533276888035585, "grad_norm": 0.1336488127708435, "learning_rate": 0.001, "loss": 2.2029, "num_input_tokens_seen": 18425766080, "step": 35150 }, { "epoch": 0.3358097713965441, "grad_norm": 0.14635372161865234, "learning_rate": 0.001, "loss": 2.2098, "num_input_tokens_seen": 18451970176, "step": 35200 }, { "epoch": 0.3362867739127324, "grad_norm": 0.13536489009857178, "learning_rate": 0.001, "loss": 2.2025, "num_input_tokens_seen": 18478179808, "step": 35250 }, { "epoch": 0.33676377642892064, "grad_norm": 0.13300848007202148, "learning_rate": 0.001, "loss": 2.2052, "num_input_tokens_seen": 18504390624, "step": 35300 }, { "epoch": 0.33724077894510895, "grad_norm": 0.1379849761724472, "learning_rate": 0.001, "loss": 2.2107, "num_input_tokens_seen": 18530605024, "step": 35350 }, { "epoch": 0.3377177814612972, "grad_norm": 0.14962108433246613, "learning_rate": 0.001, "loss": 2.2162, "num_input_tokens_seen": 18556814720, "step": 35400 }, { "epoch": 0.3381947839774855, "grad_norm": 0.13894064724445343, "learning_rate": 0.001, "loss": 2.2014, "num_input_tokens_seen": 18583020256, "step": 35450 }, { "epoch": 0.33867178649367374, "grad_norm": 0.13783761858940125, "learning_rate": 0.001, "loss": 2.2005, "num_input_tokens_seen": 18609222656, "step": 35500 }, { "epoch": 0.33867178649367374, "eval_loss": 2.1224164962768555, "eval_runtime": 82.5781, "eval_samples_per_second": 60.549, "eval_steps_per_second": 15.137, "num_input_tokens_seen": 18609222656, "step": 35500 }, { "epoch": 0.339148789009862, "grad_norm": 0.1444152593612671, "learning_rate": 0.001, "loss": 2.2034, "num_input_tokens_seen": 18635432384, "step": 35550 }, { "epoch": 0.3396257915260503, "grad_norm": 0.1575513333082199, "learning_rate": 0.001, "loss": 2.2109, "num_input_tokens_seen": 18661645568, "step": 35600 }, { "epoch": 0.3401027940422386, "grad_norm": 0.138763889670372, "learning_rate": 0.001, "loss": 2.1982, "num_input_tokens_seen": 18687859968, "step": 35650 }, { "epoch": 0.34057979655842685, "grad_norm": 0.16039063036441803, "learning_rate": 0.001, "loss": 2.2078, "num_input_tokens_seen": 18714071840, "step": 35700 }, { "epoch": 0.3410567990746151, "grad_norm": 0.1333894282579422, "learning_rate": 0.001, "loss": 2.2066, "num_input_tokens_seen": 18740277952, "step": 35750 }, { "epoch": 0.3415338015908034, "grad_norm": 0.15125052630901337, "learning_rate": 0.001, "loss": 2.2071, "num_input_tokens_seen": 18766492352, "step": 35800 }, { "epoch": 0.34201080410699164, "grad_norm": 0.13908621668815613, "learning_rate": 0.001, "loss": 2.1832, "num_input_tokens_seen": 18792706752, "step": 35850 }, { "epoch": 0.34248780662317996, "grad_norm": 0.1455029994249344, "learning_rate": 0.001, "loss": 2.2048, "num_input_tokens_seen": 18818913376, "step": 35900 }, { "epoch": 0.3429648091393682, "grad_norm": 0.14915207028388977, "learning_rate": 0.001, "loss": 2.2016, "num_input_tokens_seen": 18845127776, "step": 35950 }, { "epoch": 0.3434418116555565, "grad_norm": 0.15712140500545502, "learning_rate": 0.001, "loss": 2.1975, "num_input_tokens_seen": 18871341760, "step": 36000 }, { "epoch": 0.3434418116555565, "eval_loss": 2.119873523712158, "eval_runtime": 82.5855, "eval_samples_per_second": 60.543, "eval_steps_per_second": 15.136, "num_input_tokens_seen": 18871341760, "step": 36000 }, { "epoch": 0.34391881417174475, "grad_norm": 0.12918758392333984, "learning_rate": 0.001, "loss": 2.2082, "num_input_tokens_seen": 18897547296, "step": 36050 }, { "epoch": 0.344395816687933, "grad_norm": 0.16404707729816437, "learning_rate": 0.001, "loss": 2.2093, "num_input_tokens_seen": 18923754720, "step": 36100 }, { "epoch": 0.34487281920412133, "grad_norm": 0.1493569314479828, "learning_rate": 0.001, "loss": 2.2034, "num_input_tokens_seen": 18949959648, "step": 36150 }, { "epoch": 0.3453498217203096, "grad_norm": 0.13502757251262665, "learning_rate": 0.001, "loss": 2.2005, "num_input_tokens_seen": 18976172032, "step": 36200 }, { "epoch": 0.34582682423649785, "grad_norm": 0.148860901594162, "learning_rate": 0.001, "loss": 2.1922, "num_input_tokens_seen": 19002385664, "step": 36250 }, { "epoch": 0.3463038267526861, "grad_norm": 0.13627836108207703, "learning_rate": 0.001, "loss": 2.2087, "num_input_tokens_seen": 19028600064, "step": 36300 }, { "epoch": 0.3467808292688744, "grad_norm": 0.1387259066104889, "learning_rate": 0.001, "loss": 2.1953, "num_input_tokens_seen": 19054810240, "step": 36350 }, { "epoch": 0.34725783178506264, "grad_norm": 0.15676312148571014, "learning_rate": 0.001, "loss": 2.2, "num_input_tokens_seen": 19081021984, "step": 36400 }, { "epoch": 0.34773483430125096, "grad_norm": 0.1438487023115158, "learning_rate": 0.001, "loss": 2.2066, "num_input_tokens_seen": 19107227616, "step": 36450 }, { "epoch": 0.3482118368174392, "grad_norm": 0.1509568840265274, "learning_rate": 0.001, "loss": 2.201, "num_input_tokens_seen": 19133442016, "step": 36500 }, { "epoch": 0.3482118368174392, "eval_loss": 2.117922306060791, "eval_runtime": 82.7969, "eval_samples_per_second": 60.389, "eval_steps_per_second": 15.097, "num_input_tokens_seen": 19133442016, "step": 36500 }, { "epoch": 0.3486888393336275, "grad_norm": 0.14427947998046875, "learning_rate": 0.001, "loss": 2.2007, "num_input_tokens_seen": 19159654944, "step": 36550 }, { "epoch": 0.34916584184981575, "grad_norm": 0.1476566195487976, "learning_rate": 0.001, "loss": 2.197, "num_input_tokens_seen": 19185859168, "step": 36600 }, { "epoch": 0.349642844366004, "grad_norm": 0.13864775002002716, "learning_rate": 0.001, "loss": 2.201, "num_input_tokens_seen": 19212073568, "step": 36650 }, { "epoch": 0.3501198468821923, "grad_norm": 0.14173488318920135, "learning_rate": 0.001, "loss": 2.2017, "num_input_tokens_seen": 19238287968, "step": 36700 }, { "epoch": 0.3505968493983806, "grad_norm": 0.1381407082080841, "learning_rate": 0.001, "loss": 2.2103, "num_input_tokens_seen": 19264502368, "step": 36750 }, { "epoch": 0.35107385191456886, "grad_norm": 0.15552376210689545, "learning_rate": 0.001, "loss": 2.2028, "num_input_tokens_seen": 19290709856, "step": 36800 }, { "epoch": 0.3515508544307571, "grad_norm": 0.15400223433971405, "learning_rate": 0.001, "loss": 2.1996, "num_input_tokens_seen": 19316919616, "step": 36850 }, { "epoch": 0.3520278569469454, "grad_norm": 0.14455287158489227, "learning_rate": 0.001, "loss": 2.1958, "num_input_tokens_seen": 19343128480, "step": 36900 }, { "epoch": 0.35250485946313365, "grad_norm": 0.13802266120910645, "learning_rate": 0.001, "loss": 2.2072, "num_input_tokens_seen": 19369341216, "step": 36950 }, { "epoch": 0.35298186197932196, "grad_norm": 0.1613599807024002, "learning_rate": 0.001, "loss": 2.1968, "num_input_tokens_seen": 19395541920, "step": 37000 }, { "epoch": 0.35298186197932196, "eval_loss": 2.116856336593628, "eval_runtime": 82.1293, "eval_samples_per_second": 60.88, "eval_steps_per_second": 15.22, "num_input_tokens_seen": 19395541920, "step": 37000 }, { "epoch": 0.3534588644955102, "grad_norm": 0.14134734869003296, "learning_rate": 0.001, "loss": 2.1956, "num_input_tokens_seen": 19421746432, "step": 37050 }, { "epoch": 0.3539358670116985, "grad_norm": 0.13640139997005463, "learning_rate": 0.001, "loss": 2.1917, "num_input_tokens_seen": 19447955168, "step": 37100 }, { "epoch": 0.35441286952788675, "grad_norm": 0.15265345573425293, "learning_rate": 0.001, "loss": 2.1883, "num_input_tokens_seen": 19474169568, "step": 37150 }, { "epoch": 0.354889872044075, "grad_norm": 0.1389395147562027, "learning_rate": 0.001, "loss": 2.2005, "num_input_tokens_seen": 19500383968, "step": 37200 }, { "epoch": 0.3553668745602633, "grad_norm": 0.1275651752948761, "learning_rate": 0.001, "loss": 2.203, "num_input_tokens_seen": 19526592832, "step": 37250 }, { "epoch": 0.3558438770764516, "grad_norm": 0.1409410983324051, "learning_rate": 0.001, "loss": 2.204, "num_input_tokens_seen": 19552803712, "step": 37300 }, { "epoch": 0.35632087959263986, "grad_norm": 0.14228691160678864, "learning_rate": 0.001, "loss": 2.2163, "num_input_tokens_seen": 19579003200, "step": 37350 }, { "epoch": 0.3567978821088281, "grad_norm": 0.13135015964508057, "learning_rate": 0.001, "loss": 2.1991, "num_input_tokens_seen": 19605215264, "step": 37400 }, { "epoch": 0.3572748846250164, "grad_norm": 0.14535236358642578, "learning_rate": 0.001, "loss": 2.2015, "num_input_tokens_seen": 19631429664, "step": 37450 }, { "epoch": 0.35775188714120465, "grad_norm": 0.14450703561306, "learning_rate": 0.001, "loss": 2.204, "num_input_tokens_seen": 19657633088, "step": 37500 }, { "epoch": 0.35775188714120465, "eval_loss": 2.1144189834594727, "eval_runtime": 82.3004, "eval_samples_per_second": 60.753, "eval_steps_per_second": 15.188, "num_input_tokens_seen": 19657633088, "step": 37500 }, { "epoch": 0.35822888965739297, "grad_norm": 0.14302517473697662, "learning_rate": 0.001, "loss": 2.2022, "num_input_tokens_seen": 19683846464, "step": 37550 }, { "epoch": 0.35870589217358123, "grad_norm": 0.13624528050422668, "learning_rate": 0.001, "loss": 2.2009, "num_input_tokens_seen": 19710059744, "step": 37600 }, { "epoch": 0.3591828946897695, "grad_norm": 0.14689582586288452, "learning_rate": 0.001, "loss": 2.1933, "num_input_tokens_seen": 19736267520, "step": 37650 }, { "epoch": 0.35965989720595776, "grad_norm": 0.1342869997024536, "learning_rate": 0.001, "loss": 2.1965, "num_input_tokens_seen": 19762479264, "step": 37700 }, { "epoch": 0.360136899722146, "grad_norm": 0.14294207096099854, "learning_rate": 0.001, "loss": 2.207, "num_input_tokens_seen": 19788688896, "step": 37750 }, { "epoch": 0.3606139022383343, "grad_norm": 0.13254527747631073, "learning_rate": 0.001, "loss": 2.2012, "num_input_tokens_seen": 19814900576, "step": 37800 }, { "epoch": 0.3610909047545226, "grad_norm": 0.1584700047969818, "learning_rate": 0.001, "loss": 2.1898, "num_input_tokens_seen": 19841105472, "step": 37850 }, { "epoch": 0.36156790727071086, "grad_norm": 0.14291736483573914, "learning_rate": 0.001, "loss": 2.1938, "num_input_tokens_seen": 19867318208, "step": 37900 }, { "epoch": 0.3620449097868991, "grad_norm": 0.13364264369010925, "learning_rate": 0.001, "loss": 2.1939, "num_input_tokens_seen": 19893530176, "step": 37950 }, { "epoch": 0.3625219123030874, "grad_norm": 0.1309192031621933, "learning_rate": 0.001, "loss": 2.1979, "num_input_tokens_seen": 19919735840, "step": 38000 }, { "epoch": 0.3625219123030874, "eval_loss": 2.1131739616394043, "eval_runtime": 82.6146, "eval_samples_per_second": 60.522, "eval_steps_per_second": 15.131, "num_input_tokens_seen": 19919735840, "step": 38000 }, { "epoch": 0.36299891481927565, "grad_norm": 0.14044371247291565, "learning_rate": 0.001, "loss": 2.1977, "num_input_tokens_seen": 19945948672, "step": 38050 }, { "epoch": 0.36347591733546397, "grad_norm": 0.1485033631324768, "learning_rate": 0.001, "loss": 2.2044, "num_input_tokens_seen": 19972149696, "step": 38100 }, { "epoch": 0.36395291985165223, "grad_norm": 0.14868605136871338, "learning_rate": 0.001, "loss": 2.181, "num_input_tokens_seen": 19998356640, "step": 38150 }, { "epoch": 0.3644299223678405, "grad_norm": 0.1402069628238678, "learning_rate": 0.001, "loss": 2.2014, "num_input_tokens_seen": 20024569184, "step": 38200 }, { "epoch": 0.36490692488402876, "grad_norm": 0.14594897627830505, "learning_rate": 0.001, "loss": 2.203, "num_input_tokens_seen": 20050777216, "step": 38250 }, { "epoch": 0.365383927400217, "grad_norm": 0.13246594369411469, "learning_rate": 0.001, "loss": 2.2107, "num_input_tokens_seen": 20076989984, "step": 38300 }, { "epoch": 0.3658609299164053, "grad_norm": 0.1423010230064392, "learning_rate": 0.001, "loss": 2.2007, "num_input_tokens_seen": 20103199008, "step": 38350 }, { "epoch": 0.3663379324325936, "grad_norm": 0.1386810839176178, "learning_rate": 0.001, "loss": 2.187, "num_input_tokens_seen": 20129413248, "step": 38400 }, { "epoch": 0.36681493494878187, "grad_norm": 0.1479010432958603, "learning_rate": 0.001, "loss": 2.1924, "num_input_tokens_seen": 20155625408, "step": 38450 }, { "epoch": 0.36729193746497013, "grad_norm": 0.14512768387794495, "learning_rate": 0.001, "loss": 2.1898, "num_input_tokens_seen": 20181833600, "step": 38500 }, { "epoch": 0.36729193746497013, "eval_loss": 2.111013174057007, "eval_runtime": 82.3701, "eval_samples_per_second": 60.702, "eval_steps_per_second": 15.175, "num_input_tokens_seen": 20181833600, "step": 38500 }, { "epoch": 0.3677689399811584, "grad_norm": 0.14894255995750427, "learning_rate": 0.001, "loss": 2.1869, "num_input_tokens_seen": 20208045120, "step": 38550 }, { "epoch": 0.36824594249734666, "grad_norm": 0.140294149518013, "learning_rate": 0.001, "loss": 2.2048, "num_input_tokens_seen": 20234258528, "step": 38600 }, { "epoch": 0.3687229450135349, "grad_norm": 0.14766459167003632, "learning_rate": 0.001, "loss": 2.1961, "num_input_tokens_seen": 20260469632, "step": 38650 }, { "epoch": 0.36919994752972324, "grad_norm": 0.1636905074119568, "learning_rate": 0.001, "loss": 2.1922, "num_input_tokens_seen": 20286670944, "step": 38700 }, { "epoch": 0.3696769500459115, "grad_norm": 0.14300905168056488, "learning_rate": 0.001, "loss": 2.2001, "num_input_tokens_seen": 20312883104, "step": 38750 }, { "epoch": 0.37015395256209976, "grad_norm": 0.14150425791740417, "learning_rate": 0.001, "loss": 2.1913, "num_input_tokens_seen": 20339087360, "step": 38800 }, { "epoch": 0.370630955078288, "grad_norm": 0.13723760843276978, "learning_rate": 0.001, "loss": 2.2031, "num_input_tokens_seen": 20365290304, "step": 38850 }, { "epoch": 0.3711079575944763, "grad_norm": 0.15464797616004944, "learning_rate": 0.001, "loss": 2.1848, "num_input_tokens_seen": 20391495936, "step": 38900 }, { "epoch": 0.3715849601106646, "grad_norm": 0.15645267069339752, "learning_rate": 0.001, "loss": 2.2064, "num_input_tokens_seen": 20417707040, "step": 38950 }, { "epoch": 0.37206196262685287, "grad_norm": 0.15567755699157715, "learning_rate": 0.001, "loss": 2.1859, "num_input_tokens_seen": 20443920960, "step": 39000 }, { "epoch": 0.37206196262685287, "eval_loss": 2.110989570617676, "eval_runtime": 82.2487, "eval_samples_per_second": 60.791, "eval_steps_per_second": 15.198, "num_input_tokens_seen": 20443920960, "step": 39000 }, { "epoch": 0.37253896514304113, "grad_norm": 0.15416857600212097, "learning_rate": 0.001, "loss": 2.1945, "num_input_tokens_seen": 20470129312, "step": 39050 }, { "epoch": 0.3730159676592294, "grad_norm": 0.1459367424249649, "learning_rate": 0.001, "loss": 2.1895, "num_input_tokens_seen": 20496341472, "step": 39100 }, { "epoch": 0.37349297017541766, "grad_norm": 0.14670804142951965, "learning_rate": 0.001, "loss": 2.1992, "num_input_tokens_seen": 20522548320, "step": 39150 }, { "epoch": 0.3739699726916059, "grad_norm": 0.14550630748271942, "learning_rate": 0.001, "loss": 2.2061, "num_input_tokens_seen": 20548758304, "step": 39200 }, { "epoch": 0.37444697520779424, "grad_norm": 0.1529083102941513, "learning_rate": 0.001, "loss": 2.1914, "num_input_tokens_seen": 20574971520, "step": 39250 }, { "epoch": 0.3749239777239825, "grad_norm": 0.13946719467639923, "learning_rate": 0.001, "loss": 2.1839, "num_input_tokens_seen": 20601184288, "step": 39300 }, { "epoch": 0.37540098024017077, "grad_norm": 0.1529141515493393, "learning_rate": 0.001, "loss": 2.1962, "num_input_tokens_seen": 20627397056, "step": 39350 }, { "epoch": 0.37587798275635903, "grad_norm": 0.14381681382656097, "learning_rate": 0.001, "loss": 2.1934, "num_input_tokens_seen": 20653601696, "step": 39400 }, { "epoch": 0.3763549852725473, "grad_norm": 0.1383078545331955, "learning_rate": 0.001, "loss": 2.2031, "num_input_tokens_seen": 20679806784, "step": 39450 }, { "epoch": 0.3768319877887356, "grad_norm": 0.14761337637901306, "learning_rate": 0.001, "loss": 2.188, "num_input_tokens_seen": 20706018592, "step": 39500 }, { "epoch": 0.3768319877887356, "eval_loss": 2.1095550060272217, "eval_runtime": 82.0109, "eval_samples_per_second": 60.968, "eval_steps_per_second": 15.242, "num_input_tokens_seen": 20706018592, "step": 39500 }, { "epoch": 0.3773089903049239, "grad_norm": 0.14556308090686798, "learning_rate": 0.001, "loss": 2.1888, "num_input_tokens_seen": 20732225440, "step": 39550 }, { "epoch": 0.37778599282111214, "grad_norm": 0.14401084184646606, "learning_rate": 0.001, "loss": 2.1913, "num_input_tokens_seen": 20758439616, "step": 39600 }, { "epoch": 0.3782629953373004, "grad_norm": 0.14126934111118317, "learning_rate": 0.001, "loss": 2.1935, "num_input_tokens_seen": 20784651648, "step": 39650 }, { "epoch": 0.37873999785348866, "grad_norm": 0.1369311660528183, "learning_rate": 0.001, "loss": 2.191, "num_input_tokens_seen": 20810862880, "step": 39700 }, { "epoch": 0.3792170003696769, "grad_norm": 0.1367167979478836, "learning_rate": 0.001, "loss": 2.1953, "num_input_tokens_seen": 20837073536, "step": 39750 }, { "epoch": 0.37969400288586524, "grad_norm": 0.14433887600898743, "learning_rate": 0.001, "loss": 2.1898, "num_input_tokens_seen": 20863285568, "step": 39800 }, { "epoch": 0.3801710054020535, "grad_norm": 0.13510778546333313, "learning_rate": 0.001, "loss": 2.1825, "num_input_tokens_seen": 20889491744, "step": 39850 }, { "epoch": 0.38064800791824177, "grad_norm": 0.15846163034439087, "learning_rate": 0.001, "loss": 2.1977, "num_input_tokens_seen": 20915690304, "step": 39900 }, { "epoch": 0.38112501043443003, "grad_norm": 0.14499403536319733, "learning_rate": 0.001, "loss": 2.1809, "num_input_tokens_seen": 20941902336, "step": 39950 }, { "epoch": 0.3816020129506183, "grad_norm": 0.14737826585769653, "learning_rate": 0.001, "loss": 2.1932, "num_input_tokens_seen": 20968112960, "step": 40000 }, { "epoch": 0.3816020129506183, "eval_loss": 2.108198881149292, "eval_runtime": 81.3944, "eval_samples_per_second": 61.429, "eval_steps_per_second": 15.357, "num_input_tokens_seen": 20968112960, "step": 40000 }, { "epoch": 0.38207901546680656, "grad_norm": 0.14910832047462463, "learning_rate": 0.001, "loss": 2.195, "num_input_tokens_seen": 20994316064, "step": 40050 }, { "epoch": 0.3825560179829949, "grad_norm": 0.14551687240600586, "learning_rate": 0.001, "loss": 2.1933, "num_input_tokens_seen": 21020528128, "step": 40100 }, { "epoch": 0.38303302049918314, "grad_norm": 0.13713929057121277, "learning_rate": 0.001, "loss": 2.1921, "num_input_tokens_seen": 21046742528, "step": 40150 }, { "epoch": 0.3835100230153714, "grad_norm": 0.1518137902021408, "learning_rate": 0.001, "loss": 2.1767, "num_input_tokens_seen": 21072955296, "step": 40200 }, { "epoch": 0.38398702553155967, "grad_norm": 0.14563234150409698, "learning_rate": 0.001, "loss": 2.1913, "num_input_tokens_seen": 21099166048, "step": 40250 }, { "epoch": 0.38446402804774793, "grad_norm": 0.14845997095108032, "learning_rate": 0.001, "loss": 2.1935, "num_input_tokens_seen": 21125378816, "step": 40300 }, { "epoch": 0.38494103056393625, "grad_norm": 0.15191951394081116, "learning_rate": 0.001, "loss": 2.1873, "num_input_tokens_seen": 21151588800, "step": 40350 }, { "epoch": 0.3854180330801245, "grad_norm": 0.13842670619487762, "learning_rate": 0.001, "loss": 2.1851, "num_input_tokens_seen": 21177797408, "step": 40400 }, { "epoch": 0.3858950355963128, "grad_norm": 0.1450972706079483, "learning_rate": 0.001, "loss": 2.1998, "num_input_tokens_seen": 21204010688, "step": 40450 }, { "epoch": 0.38637203811250104, "grad_norm": 0.14317825436592102, "learning_rate": 0.001, "loss": 2.1933, "num_input_tokens_seen": 21230223584, "step": 40500 }, { "epoch": 0.38637203811250104, "eval_loss": 2.1045005321502686, "eval_runtime": 82.0473, "eval_samples_per_second": 60.94, "eval_steps_per_second": 15.235, "num_input_tokens_seen": 21230223584, "step": 40500 }, { "epoch": 0.3868490406286893, "grad_norm": 0.14561642706394196, "learning_rate": 0.001, "loss": 2.2001, "num_input_tokens_seen": 21256431712, "step": 40550 }, { "epoch": 0.38732604314487756, "grad_norm": 0.13989581167697906, "learning_rate": 0.001, "loss": 2.1761, "num_input_tokens_seen": 21282636640, "step": 40600 }, { "epoch": 0.3878030456610659, "grad_norm": 0.21038071811199188, "learning_rate": 0.001, "loss": 2.1906, "num_input_tokens_seen": 21308851040, "step": 40650 }, { "epoch": 0.38828004817725414, "grad_norm": 0.1928381323814392, "learning_rate": 0.001, "loss": 2.187, "num_input_tokens_seen": 21335059968, "step": 40700 }, { "epoch": 0.3887570506934424, "grad_norm": 0.1544865220785141, "learning_rate": 0.001, "loss": 2.1866, "num_input_tokens_seen": 21361274016, "step": 40750 }, { "epoch": 0.38923405320963067, "grad_norm": 0.13268694281578064, "learning_rate": 0.001, "loss": 2.1874, "num_input_tokens_seen": 21387478624, "step": 40800 }, { "epoch": 0.38971105572581893, "grad_norm": 0.15970471501350403, "learning_rate": 0.001, "loss": 2.1942, "num_input_tokens_seen": 21413693024, "step": 40850 }, { "epoch": 0.39018805824200725, "grad_norm": 0.15078318119049072, "learning_rate": 0.001, "loss": 2.1837, "num_input_tokens_seen": 21439892672, "step": 40900 }, { "epoch": 0.3906650607581955, "grad_norm": 0.14016783237457275, "learning_rate": 0.001, "loss": 2.1993, "num_input_tokens_seen": 21466102688, "step": 40950 }, { "epoch": 0.3911420632743838, "grad_norm": 0.13748787343502045, "learning_rate": 0.001, "loss": 2.1907, "num_input_tokens_seen": 21492310496, "step": 41000 }, { "epoch": 0.3911420632743838, "eval_loss": 2.105747699737549, "eval_runtime": 81.8993, "eval_samples_per_second": 61.051, "eval_steps_per_second": 15.263, "num_input_tokens_seen": 21492310496, "step": 41000 }, { "epoch": 0.39161906579057204, "grad_norm": 0.136869415640831, "learning_rate": 0.001, "loss": 2.1911, "num_input_tokens_seen": 21518520608, "step": 41050 }, { "epoch": 0.3920960683067603, "grad_norm": 0.15758763253688812, "learning_rate": 0.001, "loss": 2.1803, "num_input_tokens_seen": 21544732864, "step": 41100 }, { "epoch": 0.39257307082294857, "grad_norm": 0.1424143761396408, "learning_rate": 0.001, "loss": 2.1903, "num_input_tokens_seen": 21570943840, "step": 41150 }, { "epoch": 0.3930500733391369, "grad_norm": 0.14572599530220032, "learning_rate": 0.001, "loss": 2.1822, "num_input_tokens_seen": 21597153312, "step": 41200 }, { "epoch": 0.39352707585532515, "grad_norm": 0.1391698122024536, "learning_rate": 0.001, "loss": 2.1909, "num_input_tokens_seen": 21623366976, "step": 41250 }, { "epoch": 0.3940040783715134, "grad_norm": 0.13990794122219086, "learning_rate": 0.001, "loss": 2.1997, "num_input_tokens_seen": 21649573728, "step": 41300 }, { "epoch": 0.3944810808877017, "grad_norm": 0.1600644737482071, "learning_rate": 0.001, "loss": 2.186, "num_input_tokens_seen": 21675788128, "step": 41350 }, { "epoch": 0.39495808340388994, "grad_norm": 0.1378026008605957, "learning_rate": 0.001, "loss": 2.1815, "num_input_tokens_seen": 21701992096, "step": 41400 }, { "epoch": 0.39543508592007826, "grad_norm": 0.13701239228248596, "learning_rate": 0.001, "loss": 2.195, "num_input_tokens_seen": 21728199616, "step": 41450 }, { "epoch": 0.3959120884362665, "grad_norm": 0.1407209187746048, "learning_rate": 0.001, "loss": 2.1806, "num_input_tokens_seen": 21754409120, "step": 41500 }, { "epoch": 0.3959120884362665, "eval_loss": 2.1029863357543945, "eval_runtime": 82.1541, "eval_samples_per_second": 60.861, "eval_steps_per_second": 15.215, "num_input_tokens_seen": 21754409120, "step": 41500 }, { "epoch": 0.3963890909524548, "grad_norm": 0.1486450433731079, "learning_rate": 0.001, "loss": 2.1894, "num_input_tokens_seen": 21780617152, "step": 41550 }, { "epoch": 0.39686609346864304, "grad_norm": 0.14881809055805206, "learning_rate": 0.001, "loss": 2.1764, "num_input_tokens_seen": 21806827168, "step": 41600 }, { "epoch": 0.3973430959848313, "grad_norm": 0.15954989194869995, "learning_rate": 0.001, "loss": 2.195, "num_input_tokens_seen": 21833038464, "step": 41650 }, { "epoch": 0.39782009850101957, "grad_norm": 0.14994557201862335, "learning_rate": 0.001, "loss": 2.1934, "num_input_tokens_seen": 21859246336, "step": 41700 }, { "epoch": 0.3982971010172079, "grad_norm": 0.1431296467781067, "learning_rate": 0.001, "loss": 2.1858, "num_input_tokens_seen": 21885459520, "step": 41750 }, { "epoch": 0.39877410353339615, "grad_norm": 0.1418553739786148, "learning_rate": 0.001, "loss": 2.1859, "num_input_tokens_seen": 21911671392, "step": 41800 }, { "epoch": 0.3992511060495844, "grad_norm": 0.1425972878932953, "learning_rate": 0.001, "loss": 2.1917, "num_input_tokens_seen": 21937879584, "step": 41850 }, { "epoch": 0.3997281085657727, "grad_norm": 0.13912352919578552, "learning_rate": 0.001, "loss": 2.19, "num_input_tokens_seen": 21964082080, "step": 41900 }, { "epoch": 0.40020511108196094, "grad_norm": 0.16832081973552704, "learning_rate": 0.001, "loss": 2.181, "num_input_tokens_seen": 21990294784, "step": 41950 }, { "epoch": 0.4006821135981492, "grad_norm": 0.14969058334827423, "learning_rate": 0.001, "loss": 2.1834, "num_input_tokens_seen": 22016505376, "step": 42000 }, { "epoch": 0.4006821135981492, "eval_loss": 2.1013541221618652, "eval_runtime": 82.4496, "eval_samples_per_second": 60.643, "eval_steps_per_second": 15.161, "num_input_tokens_seen": 22016505376, "step": 42000 }, { "epoch": 0.4011591161143375, "grad_norm": 0.14711995422840118, "learning_rate": 0.001, "loss": 2.1871, "num_input_tokens_seen": 22042718080, "step": 42050 }, { "epoch": 0.4016361186305258, "grad_norm": 0.14118188619613647, "learning_rate": 0.001, "loss": 2.1968, "num_input_tokens_seen": 22068929760, "step": 42100 }, { "epoch": 0.40211312114671405, "grad_norm": 0.14557373523712158, "learning_rate": 0.001, "loss": 2.1815, "num_input_tokens_seen": 22095140192, "step": 42150 }, { "epoch": 0.4025901236629023, "grad_norm": 0.14241378009319305, "learning_rate": 0.001, "loss": 2.1841, "num_input_tokens_seen": 22121353760, "step": 42200 }, { "epoch": 0.4030671261790906, "grad_norm": 0.14654819667339325, "learning_rate": 0.001, "loss": 2.1857, "num_input_tokens_seen": 22147565312, "step": 42250 }, { "epoch": 0.4035441286952789, "grad_norm": 0.14023630321025848, "learning_rate": 0.001, "loss": 2.185, "num_input_tokens_seen": 22173773088, "step": 42300 }, { "epoch": 0.40402113121146715, "grad_norm": 0.1503324657678604, "learning_rate": 0.001, "loss": 2.1719, "num_input_tokens_seen": 22199972064, "step": 42350 }, { "epoch": 0.4044981337276554, "grad_norm": 0.148145854473114, "learning_rate": 0.001, "loss": 2.1854, "num_input_tokens_seen": 22226182336, "step": 42400 }, { "epoch": 0.4049751362438437, "grad_norm": 0.14223705232143402, "learning_rate": 0.001, "loss": 2.1941, "num_input_tokens_seen": 22252394144, "step": 42450 }, { "epoch": 0.40545213876003194, "grad_norm": 0.15040171146392822, "learning_rate": 0.001, "loss": 2.1914, "num_input_tokens_seen": 22278605888, "step": 42500 }, { "epoch": 0.40545213876003194, "eval_loss": 2.101572036743164, "eval_runtime": 82.9065, "eval_samples_per_second": 60.309, "eval_steps_per_second": 15.077, "num_input_tokens_seen": 22278605888, "step": 42500 }, { "epoch": 0.4059291412762202, "grad_norm": 0.1579235941171646, "learning_rate": 0.001, "loss": 2.1765, "num_input_tokens_seen": 22304819520, "step": 42550 }, { "epoch": 0.4064061437924085, "grad_norm": 0.16040007770061493, "learning_rate": 0.001, "loss": 2.1853, "num_input_tokens_seen": 22331025472, "step": 42600 }, { "epoch": 0.4068831463085968, "grad_norm": 0.14831505715847015, "learning_rate": 0.001, "loss": 2.1894, "num_input_tokens_seen": 22357235040, "step": 42650 }, { "epoch": 0.40736014882478505, "grad_norm": 0.1373136043548584, "learning_rate": 0.001, "loss": 2.1872, "num_input_tokens_seen": 22383446240, "step": 42700 }, { "epoch": 0.4078371513409733, "grad_norm": 0.13545425236225128, "learning_rate": 0.001, "loss": 2.1964, "num_input_tokens_seen": 22409660576, "step": 42750 }, { "epoch": 0.4083141538571616, "grad_norm": 0.1480574756860733, "learning_rate": 0.001, "loss": 2.1892, "num_input_tokens_seen": 22435870080, "step": 42800 }, { "epoch": 0.4087911563733499, "grad_norm": 0.14833049476146698, "learning_rate": 0.001, "loss": 2.1793, "num_input_tokens_seen": 22462079680, "step": 42850 }, { "epoch": 0.40926815888953816, "grad_norm": 0.1327161192893982, "learning_rate": 0.001, "loss": 2.1834, "num_input_tokens_seen": 22488293312, "step": 42900 }, { "epoch": 0.4097451614057264, "grad_norm": 0.15992066264152527, "learning_rate": 0.001, "loss": 2.1759, "num_input_tokens_seen": 22514505696, "step": 42950 }, { "epoch": 0.4102221639219147, "grad_norm": 0.14314264059066772, "learning_rate": 0.001, "loss": 2.1932, "num_input_tokens_seen": 22540715296, "step": 43000 }, { "epoch": 0.4102221639219147, "eval_loss": 2.0989809036254883, "eval_runtime": 82.6516, "eval_samples_per_second": 60.495, "eval_steps_per_second": 15.124, "num_input_tokens_seen": 22540715296, "step": 43000 }, { "epoch": 0.41069916643810295, "grad_norm": 0.14374396204948425, "learning_rate": 0.001, "loss": 2.1741, "num_input_tokens_seen": 22566918720, "step": 43050 }, { "epoch": 0.4111761689542912, "grad_norm": 0.14959892630577087, "learning_rate": 0.001, "loss": 2.1778, "num_input_tokens_seen": 22593129408, "step": 43100 }, { "epoch": 0.41165317147047953, "grad_norm": 0.13896231353282928, "learning_rate": 0.001, "loss": 2.1778, "num_input_tokens_seen": 22619343808, "step": 43150 }, { "epoch": 0.4121301739866678, "grad_norm": 0.14940877258777618, "learning_rate": 0.001, "loss": 2.1903, "num_input_tokens_seen": 22645555552, "step": 43200 }, { "epoch": 0.41260717650285605, "grad_norm": 0.14699922502040863, "learning_rate": 0.001, "loss": 2.1882, "num_input_tokens_seen": 22671765856, "step": 43250 }, { "epoch": 0.4130841790190443, "grad_norm": 0.13644367456436157, "learning_rate": 0.001, "loss": 2.186, "num_input_tokens_seen": 22697978816, "step": 43300 }, { "epoch": 0.4135611815352326, "grad_norm": 0.13732574880123138, "learning_rate": 0.001, "loss": 2.1823, "num_input_tokens_seen": 22724189600, "step": 43350 }, { "epoch": 0.4140381840514209, "grad_norm": 0.20614680647850037, "learning_rate": 0.001, "loss": 2.204, "num_input_tokens_seen": 22750393024, "step": 43400 }, { "epoch": 0.41451518656760916, "grad_norm": 0.3353808522224426, "learning_rate": 0.001, "loss": 2.2219, "num_input_tokens_seen": 22776601856, "step": 43450 }, { "epoch": 0.4149921890837974, "grad_norm": 0.12795452773571014, "learning_rate": 0.001, "loss": 2.2209, "num_input_tokens_seen": 22802815776, "step": 43500 }, { "epoch": 0.4149921890837974, "eval_loss": 2.1085610389709473, "eval_runtime": 81.8271, "eval_samples_per_second": 61.104, "eval_steps_per_second": 15.276, "num_input_tokens_seen": 22802815776, "step": 43500 }, { "epoch": 0.4154691915999857, "grad_norm": 0.1316380500793457, "learning_rate": 0.001, "loss": 2.1859, "num_input_tokens_seen": 22829029248, "step": 43550 }, { "epoch": 0.41594619411617395, "grad_norm": 0.14432553946971893, "learning_rate": 0.001, "loss": 2.1942, "num_input_tokens_seen": 22855233984, "step": 43600 }, { "epoch": 0.4164231966323622, "grad_norm": 0.14184366166591644, "learning_rate": 0.001, "loss": 2.1843, "num_input_tokens_seen": 22881440160, "step": 43650 }, { "epoch": 0.41690019914855053, "grad_norm": 0.13516154885292053, "learning_rate": 0.001, "loss": 2.1828, "num_input_tokens_seen": 22907643456, "step": 43700 }, { "epoch": 0.4173772016647388, "grad_norm": 0.14672012627124786, "learning_rate": 0.001, "loss": 2.1828, "num_input_tokens_seen": 22933857824, "step": 43750 }, { "epoch": 0.41785420418092706, "grad_norm": 0.15259918570518494, "learning_rate": 0.001, "loss": 2.1908, "num_input_tokens_seen": 22960061120, "step": 43800 }, { "epoch": 0.4183312066971153, "grad_norm": 0.1380903720855713, "learning_rate": 0.001, "loss": 2.1685, "num_input_tokens_seen": 22986272704, "step": 43850 }, { "epoch": 0.4188082092133036, "grad_norm": 0.149773970246315, "learning_rate": 0.001, "loss": 2.1773, "num_input_tokens_seen": 23012484064, "step": 43900 }, { "epoch": 0.41928521172949185, "grad_norm": 0.13098189234733582, "learning_rate": 0.001, "loss": 2.1854, "num_input_tokens_seen": 23038695008, "step": 43950 }, { "epoch": 0.41976221424568017, "grad_norm": 0.17141763865947723, "learning_rate": 0.001, "loss": 2.1856, "num_input_tokens_seen": 23064909408, "step": 44000 }, { "epoch": 0.41976221424568017, "eval_loss": 2.0980746746063232, "eval_runtime": 82.0381, "eval_samples_per_second": 60.947, "eval_steps_per_second": 15.237, "num_input_tokens_seen": 23064909408, "step": 44000 }, { "epoch": 0.42023921676186843, "grad_norm": 0.12846800684928894, "learning_rate": 0.001, "loss": 2.1901, "num_input_tokens_seen": 23091122912, "step": 44050 }, { "epoch": 0.4207162192780567, "grad_norm": 0.14414989948272705, "learning_rate": 0.001, "loss": 2.1852, "num_input_tokens_seen": 23117335840, "step": 44100 }, { "epoch": 0.42119322179424495, "grad_norm": 0.13725394010543823, "learning_rate": 0.001, "loss": 2.192, "num_input_tokens_seen": 23143542976, "step": 44150 }, { "epoch": 0.4216702243104332, "grad_norm": 0.14113777875900269, "learning_rate": 0.001, "loss": 2.171, "num_input_tokens_seen": 23169753472, "step": 44200 }, { "epoch": 0.42214722682662154, "grad_norm": 0.1349649280309677, "learning_rate": 0.001, "loss": 2.174, "num_input_tokens_seen": 23195956960, "step": 44250 }, { "epoch": 0.4226242293428098, "grad_norm": 0.13642828166484833, "learning_rate": 0.001, "loss": 2.1926, "num_input_tokens_seen": 23222168320, "step": 44300 }, { "epoch": 0.42310123185899806, "grad_norm": 0.15120643377304077, "learning_rate": 0.001, "loss": 2.1745, "num_input_tokens_seen": 23248378144, "step": 44350 }, { "epoch": 0.4235782343751863, "grad_norm": 0.13176341354846954, "learning_rate": 0.001, "loss": 2.1863, "num_input_tokens_seen": 23274588960, "step": 44400 }, { "epoch": 0.4240552368913746, "grad_norm": 0.14402221143245697, "learning_rate": 0.001, "loss": 2.1782, "num_input_tokens_seen": 23300803360, "step": 44450 }, { "epoch": 0.42453223940756285, "grad_norm": 0.14719614386558533, "learning_rate": 0.001, "loss": 2.1823, "num_input_tokens_seen": 23327017760, "step": 44500 }, { "epoch": 0.42453223940756285, "eval_loss": 2.0960397720336914, "eval_runtime": 82.6503, "eval_samples_per_second": 60.496, "eval_steps_per_second": 15.124, "num_input_tokens_seen": 23327017760, "step": 44500 }, { "epoch": 0.42500924192375117, "grad_norm": 0.14565804600715637, "learning_rate": 0.001, "loss": 2.1762, "num_input_tokens_seen": 23353225568, "step": 44550 }, { "epoch": 0.42548624443993943, "grad_norm": 0.13152356445789337, "learning_rate": 0.001, "loss": 2.1789, "num_input_tokens_seen": 23379439744, "step": 44600 }, { "epoch": 0.4259632469561277, "grad_norm": 0.1694796234369278, "learning_rate": 0.001, "loss": 2.1851, "num_input_tokens_seen": 23405652928, "step": 44650 }, { "epoch": 0.42644024947231596, "grad_norm": 0.14656352996826172, "learning_rate": 0.001, "loss": 2.1733, "num_input_tokens_seen": 23431859328, "step": 44700 }, { "epoch": 0.4269172519885042, "grad_norm": 0.15425816178321838, "learning_rate": 0.001, "loss": 2.1853, "num_input_tokens_seen": 23458067136, "step": 44750 }, { "epoch": 0.42739425450469254, "grad_norm": 0.14381302893161774, "learning_rate": 0.001, "loss": 2.1893, "num_input_tokens_seen": 23484275232, "step": 44800 }, { "epoch": 0.4278712570208808, "grad_norm": 0.14928653836250305, "learning_rate": 0.001, "loss": 2.1807, "num_input_tokens_seen": 23510489632, "step": 44850 }, { "epoch": 0.42834825953706906, "grad_norm": 0.15223214030265808, "learning_rate": 0.001, "loss": 2.1714, "num_input_tokens_seen": 23536691232, "step": 44900 }, { "epoch": 0.42882526205325733, "grad_norm": 0.14738094806671143, "learning_rate": 0.001, "loss": 2.17, "num_input_tokens_seen": 23562905280, "step": 44950 }, { "epoch": 0.4293022645694456, "grad_norm": 0.14292684197425842, "learning_rate": 0.001, "loss": 2.1862, "num_input_tokens_seen": 23589115072, "step": 45000 }, { "epoch": 0.4293022645694456, "eval_loss": 2.093562126159668, "eval_runtime": 82.6691, "eval_samples_per_second": 60.482, "eval_steps_per_second": 15.121, "num_input_tokens_seen": 23589115072, "step": 45000 }, { "epoch": 0.42977926708563385, "grad_norm": 0.142947256565094, "learning_rate": 0.001, "loss": 2.1711, "num_input_tokens_seen": 23615324192, "step": 45050 }, { "epoch": 0.43025626960182217, "grad_norm": 0.14863619208335876, "learning_rate": 0.001, "loss": 2.1782, "num_input_tokens_seen": 23641537056, "step": 45100 }, { "epoch": 0.43073327211801044, "grad_norm": 0.1470208466053009, "learning_rate": 0.001, "loss": 2.1829, "num_input_tokens_seen": 23667747168, "step": 45150 }, { "epoch": 0.4312102746341987, "grad_norm": 0.1326986402273178, "learning_rate": 0.001, "loss": 2.1844, "num_input_tokens_seen": 23693952480, "step": 45200 }, { "epoch": 0.43168727715038696, "grad_norm": 0.14702260494232178, "learning_rate": 0.001, "loss": 2.1752, "num_input_tokens_seen": 23720163968, "step": 45250 }, { "epoch": 0.4321642796665752, "grad_norm": 0.14227628707885742, "learning_rate": 0.001, "loss": 2.1794, "num_input_tokens_seen": 23746374976, "step": 45300 }, { "epoch": 0.4326412821827635, "grad_norm": 0.15879526734352112, "learning_rate": 0.001, "loss": 2.1892, "num_input_tokens_seen": 23772583744, "step": 45350 }, { "epoch": 0.4331182846989518, "grad_norm": 0.14691776037216187, "learning_rate": 0.001, "loss": 2.1887, "num_input_tokens_seen": 23798794624, "step": 45400 }, { "epoch": 0.43359528721514007, "grad_norm": 0.1442701518535614, "learning_rate": 0.001, "loss": 2.1744, "num_input_tokens_seen": 23824995936, "step": 45450 }, { "epoch": 0.43407228973132833, "grad_norm": 0.14576993882656097, "learning_rate": 0.001, "loss": 2.182, "num_input_tokens_seen": 23851210336, "step": 45500 }, { "epoch": 0.43407228973132833, "eval_loss": 2.092682123184204, "eval_runtime": 82.1558, "eval_samples_per_second": 60.86, "eval_steps_per_second": 15.215, "num_input_tokens_seen": 23851210336, "step": 45500 }, { "epoch": 0.4345492922475166, "grad_norm": 0.13830795884132385, "learning_rate": 0.001, "loss": 2.1708, "num_input_tokens_seen": 23877410144, "step": 45550 }, { "epoch": 0.43502629476370486, "grad_norm": 0.15177521109580994, "learning_rate": 0.001, "loss": 2.1688, "num_input_tokens_seen": 23903612352, "step": 45600 }, { "epoch": 0.4355032972798932, "grad_norm": 0.15547911822795868, "learning_rate": 0.001, "loss": 2.1825, "num_input_tokens_seen": 23929814720, "step": 45650 }, { "epoch": 0.43598029979608144, "grad_norm": 0.14573921263217926, "learning_rate": 0.001, "loss": 2.1749, "num_input_tokens_seen": 23956024000, "step": 45700 }, { "epoch": 0.4364573023122697, "grad_norm": 0.14333628118038177, "learning_rate": 0.001, "loss": 2.1751, "num_input_tokens_seen": 23982236352, "step": 45750 }, { "epoch": 0.43693430482845796, "grad_norm": 0.15511895716190338, "learning_rate": 0.001, "loss": 2.1775, "num_input_tokens_seen": 24008446560, "step": 45800 }, { "epoch": 0.4374113073446462, "grad_norm": 0.14994923770427704, "learning_rate": 0.001, "loss": 2.181, "num_input_tokens_seen": 24034660960, "step": 45850 }, { "epoch": 0.4378883098608345, "grad_norm": 0.13673779368400574, "learning_rate": 0.001, "loss": 2.1658, "num_input_tokens_seen": 24060863552, "step": 45900 }, { "epoch": 0.4383653123770228, "grad_norm": 0.13953204452991486, "learning_rate": 0.001, "loss": 2.1705, "num_input_tokens_seen": 24087066784, "step": 45950 }, { "epoch": 0.43884231489321107, "grad_norm": 0.14653468132019043, "learning_rate": 0.001, "loss": 2.1729, "num_input_tokens_seen": 24113281184, "step": 46000 }, { "epoch": 0.43884231489321107, "eval_loss": 2.091700792312622, "eval_runtime": 82.8172, "eval_samples_per_second": 60.374, "eval_steps_per_second": 15.093, "num_input_tokens_seen": 24113281184, "step": 46000 }, { "epoch": 0.43931931740939933, "grad_norm": 0.1543819010257721, "learning_rate": 0.001, "loss": 2.1704, "num_input_tokens_seen": 24139486272, "step": 46050 }, { "epoch": 0.4397963199255876, "grad_norm": 0.15496985614299774, "learning_rate": 0.001, "loss": 2.1818, "num_input_tokens_seen": 24165699840, "step": 46100 }, { "epoch": 0.44027332244177586, "grad_norm": 0.15104669332504272, "learning_rate": 0.001, "loss": 2.1816, "num_input_tokens_seen": 24191906496, "step": 46150 }, { "epoch": 0.4407503249579642, "grad_norm": 0.14507949352264404, "learning_rate": 0.001, "loss": 2.1793, "num_input_tokens_seen": 24218118464, "step": 46200 }, { "epoch": 0.44122732747415244, "grad_norm": 0.14487695693969727, "learning_rate": 0.001, "loss": 2.1673, "num_input_tokens_seen": 24244332864, "step": 46250 }, { "epoch": 0.4417043299903407, "grad_norm": 0.1322576105594635, "learning_rate": 0.001, "loss": 2.1752, "num_input_tokens_seen": 24270547264, "step": 46300 }, { "epoch": 0.44218133250652897, "grad_norm": 0.13863323628902435, "learning_rate": 0.001, "loss": 2.1724, "num_input_tokens_seen": 24296760224, "step": 46350 }, { "epoch": 0.44265833502271723, "grad_norm": 0.1451748162508011, "learning_rate": 0.001, "loss": 2.1739, "num_input_tokens_seen": 24322974624, "step": 46400 }, { "epoch": 0.4431353375389055, "grad_norm": 0.15124155580997467, "learning_rate": 0.001, "loss": 2.1692, "num_input_tokens_seen": 24349183648, "step": 46450 }, { "epoch": 0.4436123400550938, "grad_norm": 0.14303581416606903, "learning_rate": 0.001, "loss": 2.177, "num_input_tokens_seen": 24375397792, "step": 46500 }, { "epoch": 0.4436123400550938, "eval_loss": 2.0904366970062256, "eval_runtime": 82.436, "eval_samples_per_second": 60.653, "eval_steps_per_second": 15.163, "num_input_tokens_seen": 24375397792, "step": 46500 }, { "epoch": 0.4440893425712821, "grad_norm": 0.14103703200817108, "learning_rate": 0.001, "loss": 2.1682, "num_input_tokens_seen": 24401608192, "step": 46550 }, { "epoch": 0.44456634508747034, "grad_norm": 0.1284860521554947, "learning_rate": 0.001, "loss": 2.1704, "num_input_tokens_seen": 24427820992, "step": 46600 }, { "epoch": 0.4450433476036586, "grad_norm": 0.1443055421113968, "learning_rate": 0.001, "loss": 2.1686, "num_input_tokens_seen": 24454025824, "step": 46650 }, { "epoch": 0.44552035011984686, "grad_norm": 0.1435597836971283, "learning_rate": 0.001, "loss": 2.1806, "num_input_tokens_seen": 24480239680, "step": 46700 }, { "epoch": 0.4459973526360352, "grad_norm": 0.15132416784763336, "learning_rate": 0.001, "loss": 2.1737, "num_input_tokens_seen": 24506453920, "step": 46750 }, { "epoch": 0.44647435515222345, "grad_norm": 0.1403588205575943, "learning_rate": 0.001, "loss": 2.1701, "num_input_tokens_seen": 24532664288, "step": 46800 }, { "epoch": 0.4469513576684117, "grad_norm": 0.15247014164924622, "learning_rate": 0.001, "loss": 2.1694, "num_input_tokens_seen": 24558878688, "step": 46850 }, { "epoch": 0.44742836018459997, "grad_norm": 0.14112932980060577, "learning_rate": 0.001, "loss": 2.1797, "num_input_tokens_seen": 24585085216, "step": 46900 }, { "epoch": 0.44790536270078823, "grad_norm": 0.14278770983219147, "learning_rate": 0.001, "loss": 2.1671, "num_input_tokens_seen": 24611299616, "step": 46950 }, { "epoch": 0.4483823652169765, "grad_norm": 0.15718604624271393, "learning_rate": 0.001, "loss": 2.1674, "num_input_tokens_seen": 24637513248, "step": 47000 }, { "epoch": 0.4483823652169765, "eval_loss": 2.08896541595459, "eval_runtime": 81.7194, "eval_samples_per_second": 61.185, "eval_steps_per_second": 15.296, "num_input_tokens_seen": 24637513248, "step": 47000 }, { "epoch": 0.4488593677331648, "grad_norm": 0.14600330591201782, "learning_rate": 0.001, "loss": 2.1773, "num_input_tokens_seen": 24663726880, "step": 47050 }, { "epoch": 0.4493363702493531, "grad_norm": 0.13896551728248596, "learning_rate": 0.001, "loss": 2.1699, "num_input_tokens_seen": 24689934976, "step": 47100 }, { "epoch": 0.44981337276554134, "grad_norm": 0.15189655125141144, "learning_rate": 0.001, "loss": 2.1747, "num_input_tokens_seen": 24716146208, "step": 47150 }, { "epoch": 0.4502903752817296, "grad_norm": 0.1438799947500229, "learning_rate": 0.001, "loss": 2.1754, "num_input_tokens_seen": 24742351360, "step": 47200 }, { "epoch": 0.45076737779791787, "grad_norm": 0.14087191224098206, "learning_rate": 0.001, "loss": 2.1659, "num_input_tokens_seen": 24768557056, "step": 47250 }, { "epoch": 0.45124438031410613, "grad_norm": 0.1569574773311615, "learning_rate": 0.001, "loss": 2.1765, "num_input_tokens_seen": 24794768736, "step": 47300 }, { "epoch": 0.45172138283029445, "grad_norm": 0.14594893157482147, "learning_rate": 0.001, "loss": 2.1867, "num_input_tokens_seen": 24820973728, "step": 47350 }, { "epoch": 0.4521983853464827, "grad_norm": 0.13743354380130768, "learning_rate": 0.001, "loss": 2.1671, "num_input_tokens_seen": 24847180800, "step": 47400 }, { "epoch": 0.452675387862671, "grad_norm": 0.14880713820457458, "learning_rate": 0.001, "loss": 2.1834, "num_input_tokens_seen": 24873395200, "step": 47450 }, { "epoch": 0.45315239037885924, "grad_norm": 0.13658978044986725, "learning_rate": 0.001, "loss": 2.1608, "num_input_tokens_seen": 24899608000, "step": 47500 }, { "epoch": 0.45315239037885924, "eval_loss": 2.0886528491973877, "eval_runtime": 82.7799, "eval_samples_per_second": 60.401, "eval_steps_per_second": 15.1, "num_input_tokens_seen": 24899608000, "step": 47500 }, { "epoch": 0.4536293928950475, "grad_norm": 0.14707359671592712, "learning_rate": 0.001, "loss": 2.172, "num_input_tokens_seen": 24925815680, "step": 47550 }, { "epoch": 0.4541063954112358, "grad_norm": 0.16340535879135132, "learning_rate": 0.001, "loss": 2.1721, "num_input_tokens_seen": 24952024960, "step": 47600 }, { "epoch": 0.4545833979274241, "grad_norm": 0.14133617281913757, "learning_rate": 0.001, "loss": 2.1682, "num_input_tokens_seen": 24978238080, "step": 47650 }, { "epoch": 0.45506040044361235, "grad_norm": 0.14507652819156647, "learning_rate": 0.001, "loss": 2.1717, "num_input_tokens_seen": 25004442496, "step": 47700 }, { "epoch": 0.4555374029598006, "grad_norm": 0.1635296642780304, "learning_rate": 0.001, "loss": 2.1722, "num_input_tokens_seen": 25030655840, "step": 47750 }, { "epoch": 0.45601440547598887, "grad_norm": 0.15049296617507935, "learning_rate": 0.001, "loss": 2.1647, "num_input_tokens_seen": 25056870240, "step": 47800 }, { "epoch": 0.45649140799217713, "grad_norm": 0.14016319811344147, "learning_rate": 0.001, "loss": 2.3042, "num_input_tokens_seen": 25083083712, "step": 47850 }, { "epoch": 0.45696841050836545, "grad_norm": 0.1369781345129013, "learning_rate": 0.001, "loss": 2.21, "num_input_tokens_seen": 25109294720, "step": 47900 }, { "epoch": 0.4574454130245537, "grad_norm": 0.13268031179904938, "learning_rate": 0.001, "loss": 2.1809, "num_input_tokens_seen": 25135504256, "step": 47950 }, { "epoch": 0.457922415540742, "grad_norm": 0.13591749966144562, "learning_rate": 0.001, "loss": 2.1808, "num_input_tokens_seen": 25161718656, "step": 48000 }, { "epoch": 0.457922415540742, "eval_loss": 2.0938363075256348, "eval_runtime": 81.9703, "eval_samples_per_second": 60.998, "eval_steps_per_second": 15.249, "num_input_tokens_seen": 25161718656, "step": 48000 }, { "epoch": 0.45839941805693024, "grad_norm": 0.13940733671188354, "learning_rate": 0.001, "loss": 2.174, "num_input_tokens_seen": 25187922848, "step": 48050 }, { "epoch": 0.4588764205731185, "grad_norm": 0.16502974927425385, "learning_rate": 0.001, "loss": 2.1807, "num_input_tokens_seen": 25214132864, "step": 48100 }, { "epoch": 0.4593534230893068, "grad_norm": 0.15250737965106964, "learning_rate": 0.001, "loss": 2.1831, "num_input_tokens_seen": 25240339520, "step": 48150 }, { "epoch": 0.4598304256054951, "grad_norm": 0.14336740970611572, "learning_rate": 0.001, "loss": 2.175, "num_input_tokens_seen": 25266553920, "step": 48200 }, { "epoch": 0.46030742812168335, "grad_norm": 0.1376286745071411, "learning_rate": 0.001, "loss": 2.1733, "num_input_tokens_seen": 25292766560, "step": 48250 }, { "epoch": 0.4607844306378716, "grad_norm": 0.1339864432811737, "learning_rate": 0.001, "loss": 2.1667, "num_input_tokens_seen": 25318970496, "step": 48300 }, { "epoch": 0.4612614331540599, "grad_norm": 0.14675366878509521, "learning_rate": 0.001, "loss": 2.1784, "num_input_tokens_seen": 25345180512, "step": 48350 }, { "epoch": 0.46173843567024814, "grad_norm": 0.14352139830589294, "learning_rate": 0.001, "loss": 2.1915, "num_input_tokens_seen": 25371386368, "step": 48400 }, { "epoch": 0.46221543818643646, "grad_norm": 0.14589083194732666, "learning_rate": 0.001, "loss": 2.1692, "num_input_tokens_seen": 25397588192, "step": 48450 }, { "epoch": 0.4626924407026247, "grad_norm": 0.1392335146665573, "learning_rate": 0.001, "loss": 2.1811, "num_input_tokens_seen": 25423801984, "step": 48500 }, { "epoch": 0.4626924407026247, "eval_loss": 2.0870039463043213, "eval_runtime": 82.4574, "eval_samples_per_second": 60.637, "eval_steps_per_second": 15.159, "num_input_tokens_seen": 25423801984, "step": 48500 }, { "epoch": 0.463169443218813, "grad_norm": 0.14096789062023163, "learning_rate": 0.001, "loss": 2.1822, "num_input_tokens_seen": 25450016384, "step": 48550 }, { "epoch": 0.46364644573500124, "grad_norm": 0.13657501339912415, "learning_rate": 0.001, "loss": 2.1633, "num_input_tokens_seen": 25476223712, "step": 48600 }, { "epoch": 0.4641234482511895, "grad_norm": 0.1375761330127716, "learning_rate": 0.001, "loss": 2.1601, "num_input_tokens_seen": 25502435136, "step": 48650 }, { "epoch": 0.46460045076737777, "grad_norm": 0.13810068368911743, "learning_rate": 0.001, "loss": 2.1651, "num_input_tokens_seen": 25528648192, "step": 48700 }, { "epoch": 0.4650774532835661, "grad_norm": 0.1375926285982132, "learning_rate": 0.001, "loss": 2.1766, "num_input_tokens_seen": 25554860256, "step": 48750 }, { "epoch": 0.46555445579975435, "grad_norm": 0.14654815196990967, "learning_rate": 0.001, "loss": 2.1634, "num_input_tokens_seen": 25581068864, "step": 48800 }, { "epoch": 0.4660314583159426, "grad_norm": 0.1339625120162964, "learning_rate": 0.001, "loss": 2.1681, "num_input_tokens_seen": 25607278112, "step": 48850 }, { "epoch": 0.4665084608321309, "grad_norm": 0.13390694558620453, "learning_rate": 0.001, "loss": 2.1789, "num_input_tokens_seen": 25633491968, "step": 48900 }, { "epoch": 0.46698546334831914, "grad_norm": 0.14397822320461273, "learning_rate": 0.001, "loss": 2.1525, "num_input_tokens_seen": 25659705568, "step": 48950 }, { "epoch": 0.46746246586450746, "grad_norm": 0.12739968299865723, "learning_rate": 0.001, "loss": 2.1621, "num_input_tokens_seen": 25685912544, "step": 49000 }, { "epoch": 0.46746246586450746, "eval_loss": 2.0851972103118896, "eval_runtime": 82.4678, "eval_samples_per_second": 60.63, "eval_steps_per_second": 15.157, "num_input_tokens_seen": 25685912544, "step": 49000 }, { "epoch": 0.4679394683806957, "grad_norm": 0.14692357182502747, "learning_rate": 0.001, "loss": 2.1643, "num_input_tokens_seen": 25712121760, "step": 49050 }, { "epoch": 0.468416470896884, "grad_norm": 0.13649721443653107, "learning_rate": 0.001, "loss": 2.1685, "num_input_tokens_seen": 25738335680, "step": 49100 }, { "epoch": 0.46889347341307225, "grad_norm": 0.1307746022939682, "learning_rate": 0.001, "loss": 2.1742, "num_input_tokens_seen": 25764544640, "step": 49150 }, { "epoch": 0.4693704759292605, "grad_norm": 0.1445266157388687, "learning_rate": 0.001, "loss": 2.1758, "num_input_tokens_seen": 25790759040, "step": 49200 }, { "epoch": 0.4698474784454488, "grad_norm": 0.14383389055728912, "learning_rate": 0.001, "loss": 2.1634, "num_input_tokens_seen": 25816971648, "step": 49250 }, { "epoch": 0.4703244809616371, "grad_norm": 0.1344735324382782, "learning_rate": 0.001, "loss": 2.1748, "num_input_tokens_seen": 25843179040, "step": 49300 }, { "epoch": 0.47080148347782536, "grad_norm": 0.1436392366886139, "learning_rate": 0.001, "loss": 2.1739, "num_input_tokens_seen": 25869388544, "step": 49350 }, { "epoch": 0.4712784859940136, "grad_norm": 0.14839567244052887, "learning_rate": 0.001, "loss": 2.1645, "num_input_tokens_seen": 25895600800, "step": 49400 }, { "epoch": 0.4717554885102019, "grad_norm": 0.14000116288661957, "learning_rate": 0.001, "loss": 2.1754, "num_input_tokens_seen": 25921808160, "step": 49450 }, { "epoch": 0.47223249102639014, "grad_norm": 0.1437309980392456, "learning_rate": 0.001, "loss": 2.1722, "num_input_tokens_seen": 25948022560, "step": 49500 }, { "epoch": 0.47223249102639014, "eval_loss": 2.083247184753418, "eval_runtime": 82.7627, "eval_samples_per_second": 60.414, "eval_steps_per_second": 15.103, "num_input_tokens_seen": 25948022560, "step": 49500 }, { "epoch": 0.47270949354257846, "grad_norm": 0.14076103270053864, "learning_rate": 0.001, "loss": 2.1761, "num_input_tokens_seen": 25974234496, "step": 49550 }, { "epoch": 0.4731864960587667, "grad_norm": 0.13715969026088715, "learning_rate": 0.001, "loss": 2.1728, "num_input_tokens_seen": 26000448896, "step": 49600 }, { "epoch": 0.473663498574955, "grad_norm": 0.14823545515537262, "learning_rate": 0.001, "loss": 2.1646, "num_input_tokens_seen": 26026663296, "step": 49650 }, { "epoch": 0.47414050109114325, "grad_norm": 0.1491384655237198, "learning_rate": 0.001, "loss": 2.1674, "num_input_tokens_seen": 26052869728, "step": 49700 }, { "epoch": 0.4746175036073315, "grad_norm": 0.13799893856048584, "learning_rate": 0.001, "loss": 2.1577, "num_input_tokens_seen": 26079080768, "step": 49750 }, { "epoch": 0.4750945061235198, "grad_norm": 0.1610012948513031, "learning_rate": 0.001, "loss": 2.1546, "num_input_tokens_seen": 26105295168, "step": 49800 }, { "epoch": 0.4755715086397081, "grad_norm": 0.13887785375118256, "learning_rate": 0.001, "loss": 2.1642, "num_input_tokens_seen": 26131495392, "step": 49850 }, { "epoch": 0.47604851115589636, "grad_norm": 0.14724285900592804, "learning_rate": 0.001, "loss": 2.167, "num_input_tokens_seen": 26157704320, "step": 49900 }, { "epoch": 0.4765255136720846, "grad_norm": 0.13931205868721008, "learning_rate": 0.001, "loss": 2.1665, "num_input_tokens_seen": 26183918720, "step": 49950 }, { "epoch": 0.4770025161882729, "grad_norm": 0.13653016090393066, "learning_rate": 0.001, "loss": 2.1745, "num_input_tokens_seen": 26210133120, "step": 50000 }, { "epoch": 0.4770025161882729, "eval_loss": 2.0823795795440674, "eval_runtime": 82.2807, "eval_samples_per_second": 60.768, "eval_steps_per_second": 15.192, "num_input_tokens_seen": 26210133120, "step": 50000 }, { "epoch": 0.47747951870446115, "grad_norm": 0.14365418255329132, "learning_rate": 0.001, "loss": 2.1567, "num_input_tokens_seen": 26236339488, "step": 50050 }, { "epoch": 0.47795652122064947, "grad_norm": 0.1386982500553131, "learning_rate": 0.001, "loss": 2.158, "num_input_tokens_seen": 26262549248, "step": 50100 }, { "epoch": 0.47843352373683773, "grad_norm": 0.1469505876302719, "learning_rate": 0.001, "loss": 2.1654, "num_input_tokens_seen": 26288757120, "step": 50150 }, { "epoch": 0.478910526253026, "grad_norm": 0.1320936232805252, "learning_rate": 0.001, "loss": 2.167, "num_input_tokens_seen": 26314968160, "step": 50200 }, { "epoch": 0.47938752876921426, "grad_norm": 0.14790290594100952, "learning_rate": 0.001, "loss": 2.1601, "num_input_tokens_seen": 26341180480, "step": 50250 }, { "epoch": 0.4798645312854025, "grad_norm": 0.14135821163654327, "learning_rate": 0.001, "loss": 2.1659, "num_input_tokens_seen": 26367381728, "step": 50300 }, { "epoch": 0.4803415338015908, "grad_norm": 0.13028793036937714, "learning_rate": 0.001, "loss": 2.1619, "num_input_tokens_seen": 26393594272, "step": 50350 }, { "epoch": 0.4808185363177791, "grad_norm": 0.16743403673171997, "learning_rate": 0.001, "loss": 2.1674, "num_input_tokens_seen": 26419802944, "step": 50400 }, { "epoch": 0.48129553883396736, "grad_norm": 0.145367830991745, "learning_rate": 0.001, "loss": 2.1681, "num_input_tokens_seen": 26446014976, "step": 50450 }, { "epoch": 0.4817725413501556, "grad_norm": 0.149298757314682, "learning_rate": 0.001, "loss": 2.1529, "num_input_tokens_seen": 26472227840, "step": 50500 }, { "epoch": 0.4817725413501556, "eval_loss": 2.0811688899993896, "eval_runtime": 80.0431, "eval_samples_per_second": 62.466, "eval_steps_per_second": 15.617, "num_input_tokens_seen": 26472227840, "step": 50500 }, { "epoch": 0.4822495438663439, "grad_norm": 0.1397143453359604, "learning_rate": 0.001, "loss": 2.1683, "num_input_tokens_seen": 26498441728, "step": 50550 }, { "epoch": 0.48272654638253215, "grad_norm": 0.14634068310260773, "learning_rate": 0.001, "loss": 2.1586, "num_input_tokens_seen": 26524650432, "step": 50600 }, { "epoch": 0.4832035488987204, "grad_norm": 0.15363429486751556, "learning_rate": 0.001, "loss": 2.1711, "num_input_tokens_seen": 26550864832, "step": 50650 }, { "epoch": 0.48368055141490873, "grad_norm": 0.15214493870735168, "learning_rate": 0.001, "loss": 2.1623, "num_input_tokens_seen": 26577068512, "step": 50700 }, { "epoch": 0.484157553931097, "grad_norm": 0.14321520924568176, "learning_rate": 0.001, "loss": 2.1749, "num_input_tokens_seen": 26603282368, "step": 50750 }, { "epoch": 0.48463455644728526, "grad_norm": 0.15269018709659576, "learning_rate": 0.001, "loss": 2.166, "num_input_tokens_seen": 26629493472, "step": 50800 }, { "epoch": 0.4851115589634735, "grad_norm": 0.1434074342250824, "learning_rate": 0.001, "loss": 2.157, "num_input_tokens_seen": 26655707872, "step": 50850 }, { "epoch": 0.4855885614796618, "grad_norm": 0.1321389526128769, "learning_rate": 0.001, "loss": 2.1604, "num_input_tokens_seen": 26681921088, "step": 50900 }, { "epoch": 0.4860655639958501, "grad_norm": 0.1456880420446396, "learning_rate": 0.001, "loss": 2.1602, "num_input_tokens_seen": 26708125664, "step": 50950 }, { "epoch": 0.48654256651203837, "grad_norm": 0.1457262486219406, "learning_rate": 0.001, "loss": 2.169, "num_input_tokens_seen": 26734340064, "step": 51000 }, { "epoch": 0.48654256651203837, "eval_loss": 2.08145809173584, "eval_runtime": 80.0702, "eval_samples_per_second": 62.445, "eval_steps_per_second": 15.611, "num_input_tokens_seen": 26734340064, "step": 51000 }, { "epoch": 0.48701956902822663, "grad_norm": 0.13553930819034576, "learning_rate": 0.001, "loss": 2.1677, "num_input_tokens_seen": 26760554464, "step": 51050 }, { "epoch": 0.4874965715444149, "grad_norm": 0.15178151428699493, "learning_rate": 0.001, "loss": 2.1729, "num_input_tokens_seen": 26786768096, "step": 51100 }, { "epoch": 0.48797357406060315, "grad_norm": 0.14045366644859314, "learning_rate": 0.001, "loss": 2.1808, "num_input_tokens_seen": 26812978944, "step": 51150 }, { "epoch": 0.4884505765767914, "grad_norm": 0.1409856528043747, "learning_rate": 0.001, "loss": 2.1614, "num_input_tokens_seen": 26839186816, "step": 51200 }, { "epoch": 0.48892757909297974, "grad_norm": 0.16543184220790863, "learning_rate": 0.001, "loss": 2.1763, "num_input_tokens_seen": 26865386144, "step": 51250 }, { "epoch": 0.489404581609168, "grad_norm": 0.14048989117145538, "learning_rate": 0.001, "loss": 2.18, "num_input_tokens_seen": 26891600064, "step": 51300 }, { "epoch": 0.48988158412535626, "grad_norm": 0.13343140482902527, "learning_rate": 0.001, "loss": 2.1598, "num_input_tokens_seen": 26917798688, "step": 51350 }, { "epoch": 0.4903585866415445, "grad_norm": 0.1373709738254547, "learning_rate": 0.001, "loss": 2.1646, "num_input_tokens_seen": 26944004160, "step": 51400 }, { "epoch": 0.4908355891577328, "grad_norm": 0.14587919414043427, "learning_rate": 0.001, "loss": 2.1611, "num_input_tokens_seen": 26970218560, "step": 51450 }, { "epoch": 0.4913125916739211, "grad_norm": 0.1413598656654358, "learning_rate": 0.001, "loss": 2.1738, "num_input_tokens_seen": 26996432960, "step": 51500 }, { "epoch": 0.4913125916739211, "eval_loss": 2.0796499252319336, "eval_runtime": 80.2566, "eval_samples_per_second": 62.3, "eval_steps_per_second": 15.575, "num_input_tokens_seen": 26996432960, "step": 51500 }, { "epoch": 0.49178959419010937, "grad_norm": 0.12603874504566193, "learning_rate": 0.001, "loss": 2.1487, "num_input_tokens_seen": 27022644512, "step": 51550 }, { "epoch": 0.49226659670629763, "grad_norm": 0.15144561231136322, "learning_rate": 0.001, "loss": 2.1619, "num_input_tokens_seen": 27048858912, "step": 51600 }, { "epoch": 0.4927435992224859, "grad_norm": 0.15037000179290771, "learning_rate": 0.001, "loss": 2.1639, "num_input_tokens_seen": 27075051872, "step": 51650 }, { "epoch": 0.49322060173867416, "grad_norm": 0.14638635516166687, "learning_rate": 0.001, "loss": 2.1688, "num_input_tokens_seen": 27101261408, "step": 51700 }, { "epoch": 0.4936976042548624, "grad_norm": 0.1408969908952713, "learning_rate": 0.001, "loss": 2.1768, "num_input_tokens_seen": 27127465824, "step": 51750 }, { "epoch": 0.49417460677105074, "grad_norm": 0.15393120050430298, "learning_rate": 0.001, "loss": 2.1719, "num_input_tokens_seen": 27153675296, "step": 51800 }, { "epoch": 0.494651609287239, "grad_norm": 0.13638852536678314, "learning_rate": 0.001, "loss": 2.1553, "num_input_tokens_seen": 27179888192, "step": 51850 }, { "epoch": 0.49512861180342727, "grad_norm": 0.1885574907064438, "learning_rate": 0.001, "loss": 2.1764, "num_input_tokens_seen": 27206102592, "step": 51900 }, { "epoch": 0.49560561431961553, "grad_norm": 0.15341401100158691, "learning_rate": 0.001, "loss": 2.172, "num_input_tokens_seen": 27232311776, "step": 51950 }, { "epoch": 0.4960826168358038, "grad_norm": 0.17189666628837585, "learning_rate": 0.001, "loss": 2.169, "num_input_tokens_seen": 27258524544, "step": 52000 }, { "epoch": 0.4960826168358038, "eval_loss": 2.080231189727783, "eval_runtime": 80.3516, "eval_samples_per_second": 62.227, "eval_steps_per_second": 15.557, "num_input_tokens_seen": 27258524544, "step": 52000 }, { "epoch": 0.49655961935199205, "grad_norm": 0.14001034200191498, "learning_rate": 0.001, "loss": 2.164, "num_input_tokens_seen": 27284732256, "step": 52050 }, { "epoch": 0.4970366218681804, "grad_norm": 0.16195432841777802, "learning_rate": 0.001, "loss": 2.1655, "num_input_tokens_seen": 27310934944, "step": 52100 }, { "epoch": 0.49751362438436864, "grad_norm": 0.14236243069171906, "learning_rate": 0.001, "loss": 2.1629, "num_input_tokens_seen": 27337149344, "step": 52150 }, { "epoch": 0.4979906269005569, "grad_norm": 0.13297200202941895, "learning_rate": 0.001, "loss": 2.1536, "num_input_tokens_seen": 27363363744, "step": 52200 }, { "epoch": 0.49846762941674516, "grad_norm": 0.13531994819641113, "learning_rate": 0.001, "loss": 2.1595, "num_input_tokens_seen": 27389575584, "step": 52250 }, { "epoch": 0.4989446319329334, "grad_norm": 0.13988707959651947, "learning_rate": 0.001, "loss": 2.1727, "num_input_tokens_seen": 27415789984, "step": 52300 }, { "epoch": 0.49942163444912174, "grad_norm": 0.151968851685524, "learning_rate": 0.001, "loss": 2.1668, "num_input_tokens_seen": 27442001344, "step": 52350 }, { "epoch": 0.49989863696531, "grad_norm": 0.12845058739185333, "learning_rate": 0.001, "loss": 2.1681, "num_input_tokens_seen": 27468215456, "step": 52400 }, { "epoch": 0.5003756394814982, "grad_norm": 0.14046697318553925, "learning_rate": 0.001, "loss": 2.1608, "num_input_tokens_seen": 27494423232, "step": 52450 }, { "epoch": 0.5008526419976865, "grad_norm": 0.147069051861763, "learning_rate": 0.001, "loss": 2.1557, "num_input_tokens_seen": 27520636736, "step": 52500 }, { "epoch": 0.5008526419976865, "eval_loss": 2.077626943588257, "eval_runtime": 80.4239, "eval_samples_per_second": 62.171, "eval_steps_per_second": 15.543, "num_input_tokens_seen": 27520636736, "step": 52500 }, { "epoch": 0.5013296445138749, "grad_norm": 0.14614522457122803, "learning_rate": 0.001, "loss": 2.1555, "num_input_tokens_seen": 27546844160, "step": 52550 }, { "epoch": 0.5018066470300631, "grad_norm": 0.15690810978412628, "learning_rate": 0.001, "loss": 2.1626, "num_input_tokens_seen": 27573057792, "step": 52600 }, { "epoch": 0.5022836495462514, "grad_norm": 0.1412731409072876, "learning_rate": 0.001, "loss": 2.1639, "num_input_tokens_seen": 27599266240, "step": 52650 }, { "epoch": 0.5027606520624396, "grad_norm": 0.15567494928836823, "learning_rate": 0.001, "loss": 2.1606, "num_input_tokens_seen": 27625477504, "step": 52700 }, { "epoch": 0.5032376545786279, "grad_norm": 0.13818155229091644, "learning_rate": 0.001, "loss": 2.1655, "num_input_tokens_seen": 27651687680, "step": 52750 }, { "epoch": 0.5037146570948162, "grad_norm": 0.15351204574108124, "learning_rate": 0.001, "loss": 2.1599, "num_input_tokens_seen": 27677893760, "step": 52800 }, { "epoch": 0.5041916596110044, "grad_norm": 0.1560334414243698, "learning_rate": 0.001, "loss": 2.1629, "num_input_tokens_seen": 27704100960, "step": 52850 }, { "epoch": 0.5046686621271927, "grad_norm": 0.1419985294342041, "learning_rate": 0.001, "loss": 2.1697, "num_input_tokens_seen": 27730315136, "step": 52900 }, { "epoch": 0.505145664643381, "grad_norm": 0.16582848131656647, "learning_rate": 0.001, "loss": 2.1592, "num_input_tokens_seen": 27756524992, "step": 52950 }, { "epoch": 0.5056226671595693, "grad_norm": 0.14559602737426758, "learning_rate": 0.001, "loss": 2.1765, "num_input_tokens_seen": 27782732608, "step": 53000 }, { "epoch": 0.5056226671595693, "eval_loss": 2.082807779312134, "eval_runtime": 79.9388, "eval_samples_per_second": 62.548, "eval_steps_per_second": 15.637, "num_input_tokens_seen": 27782732608, "step": 53000 }, { "epoch": 0.5060996696757576, "grad_norm": 0.1368139386177063, "learning_rate": 0.001, "loss": 2.1633, "num_input_tokens_seen": 27808946304, "step": 53050 }, { "epoch": 0.5065766721919458, "grad_norm": 0.13983768224716187, "learning_rate": 0.001, "loss": 2.1664, "num_input_tokens_seen": 27835155744, "step": 53100 }, { "epoch": 0.5070536747081341, "grad_norm": 0.14930413663387299, "learning_rate": 0.001, "loss": 2.1593, "num_input_tokens_seen": 27861367328, "step": 53150 }, { "epoch": 0.5075306772243223, "grad_norm": 0.1432899385690689, "learning_rate": 0.001, "loss": 2.16, "num_input_tokens_seen": 27887572224, "step": 53200 }, { "epoch": 0.5080076797405106, "grad_norm": 0.1435759961605072, "learning_rate": 0.001, "loss": 2.1647, "num_input_tokens_seen": 27913770912, "step": 53250 }, { "epoch": 0.5084846822566989, "grad_norm": 0.14046362042427063, "learning_rate": 0.001, "loss": 2.1631, "num_input_tokens_seen": 27939983648, "step": 53300 }, { "epoch": 0.5089616847728872, "grad_norm": 0.14235271513462067, "learning_rate": 0.001, "loss": 2.1594, "num_input_tokens_seen": 27966198048, "step": 53350 }, { "epoch": 0.5094386872890755, "grad_norm": 0.14583303034305573, "learning_rate": 0.001, "loss": 2.1593, "num_input_tokens_seen": 27992412448, "step": 53400 }, { "epoch": 0.5099156898052637, "grad_norm": 0.1448000818490982, "learning_rate": 0.001, "loss": 2.1634, "num_input_tokens_seen": 28018625792, "step": 53450 }, { "epoch": 0.510392692321452, "grad_norm": 0.1414560228586197, "learning_rate": 0.001, "loss": 2.1616, "num_input_tokens_seen": 28044839456, "step": 53500 }, { "epoch": 0.510392692321452, "eval_loss": 2.076679229736328, "eval_runtime": 80.0345, "eval_samples_per_second": 62.473, "eval_steps_per_second": 15.618, "num_input_tokens_seen": 28044839456, "step": 53500 }, { "epoch": 0.5108696948376402, "grad_norm": 0.13138248026371002, "learning_rate": 0.001, "loss": 2.1597, "num_input_tokens_seen": 28071046464, "step": 53550 }, { "epoch": 0.5113466973538285, "grad_norm": 0.14930233359336853, "learning_rate": 0.001, "loss": 2.1546, "num_input_tokens_seen": 28097258400, "step": 53600 }, { "epoch": 0.5118236998700169, "grad_norm": 0.15363110601902008, "learning_rate": 0.001, "loss": 2.146, "num_input_tokens_seen": 28123467136, "step": 53650 }, { "epoch": 0.5123007023862051, "grad_norm": 0.13691812753677368, "learning_rate": 0.001, "loss": 2.1597, "num_input_tokens_seen": 28149680096, "step": 53700 }, { "epoch": 0.5127777049023934, "grad_norm": 0.14878015220165253, "learning_rate": 0.001, "loss": 2.154, "num_input_tokens_seen": 28175888256, "step": 53750 }, { "epoch": 0.5132547074185816, "grad_norm": 0.1334819197654724, "learning_rate": 0.001, "loss": 2.1518, "num_input_tokens_seen": 28202098624, "step": 53800 }, { "epoch": 0.5137317099347699, "grad_norm": 0.14382654428482056, "learning_rate": 0.001, "loss": 2.1755, "num_input_tokens_seen": 28228306496, "step": 53850 }, { "epoch": 0.5142087124509582, "grad_norm": 0.13571012020111084, "learning_rate": 0.001, "loss": 2.1601, "num_input_tokens_seen": 28254520640, "step": 53900 }, { "epoch": 0.5146857149671464, "grad_norm": 0.13496848940849304, "learning_rate": 0.001, "loss": 2.1635, "num_input_tokens_seen": 28280731968, "step": 53950 }, { "epoch": 0.5151627174833348, "grad_norm": 0.13804535567760468, "learning_rate": 0.001, "loss": 2.1569, "num_input_tokens_seen": 28306946368, "step": 54000 }, { "epoch": 0.5151627174833348, "eval_loss": 2.075817108154297, "eval_runtime": 80.0868, "eval_samples_per_second": 62.432, "eval_steps_per_second": 15.608, "num_input_tokens_seen": 28306946368, "step": 54000 }, { "epoch": 0.515639719999523, "grad_norm": 0.13443215191364288, "learning_rate": 0.001, "loss": 2.1567, "num_input_tokens_seen": 28333154944, "step": 54050 }, { "epoch": 0.5161167225157113, "grad_norm": 0.141039177775383, "learning_rate": 0.001, "loss": 2.1683, "num_input_tokens_seen": 28359365504, "step": 54100 }, { "epoch": 0.5165937250318996, "grad_norm": 0.14250704646110535, "learning_rate": 0.001, "loss": 2.1604, "num_input_tokens_seen": 28385573376, "step": 54150 }, { "epoch": 0.5170707275480878, "grad_norm": 0.14478139579296112, "learning_rate": 0.001, "loss": 2.1557, "num_input_tokens_seen": 28411779968, "step": 54200 }, { "epoch": 0.5175477300642761, "grad_norm": 0.14316383004188538, "learning_rate": 0.001, "loss": 2.1604, "num_input_tokens_seen": 28437988288, "step": 54250 }, { "epoch": 0.5180247325804643, "grad_norm": 0.15016962587833405, "learning_rate": 0.001, "loss": 2.1591, "num_input_tokens_seen": 28464201280, "step": 54300 }, { "epoch": 0.5185017350966526, "grad_norm": 0.1401468962430954, "learning_rate": 0.001, "loss": 2.1483, "num_input_tokens_seen": 28490409024, "step": 54350 }, { "epoch": 0.5189787376128409, "grad_norm": 0.14955569803714752, "learning_rate": 0.001, "loss": 2.163, "num_input_tokens_seen": 28516622176, "step": 54400 }, { "epoch": 0.5194557401290292, "grad_norm": 0.1313570886850357, "learning_rate": 0.001, "loss": 2.1526, "num_input_tokens_seen": 28542833568, "step": 54450 }, { "epoch": 0.5199327426452175, "grad_norm": 0.15107598900794983, "learning_rate": 0.001, "loss": 2.1561, "num_input_tokens_seen": 28569047936, "step": 54500 }, { "epoch": 0.5199327426452175, "eval_loss": 2.074584484100342, "eval_runtime": 80.0388, "eval_samples_per_second": 62.47, "eval_steps_per_second": 15.617, "num_input_tokens_seen": 28569047936, "step": 54500 }, { "epoch": 0.5204097451614057, "grad_norm": 0.14131924510002136, "learning_rate": 0.001, "loss": 2.1566, "num_input_tokens_seen": 28595252544, "step": 54550 }, { "epoch": 0.520886747677594, "grad_norm": 0.14203251898288727, "learning_rate": 0.001, "loss": 2.1472, "num_input_tokens_seen": 28621463264, "step": 54600 }, { "epoch": 0.5213637501937822, "grad_norm": 0.1641647219657898, "learning_rate": 0.001, "loss": 2.1512, "num_input_tokens_seen": 28647667552, "step": 54650 }, { "epoch": 0.5218407527099705, "grad_norm": 0.148850217461586, "learning_rate": 0.001, "loss": 2.1577, "num_input_tokens_seen": 28673875968, "step": 54700 }, { "epoch": 0.5223177552261589, "grad_norm": 0.1406261920928955, "learning_rate": 0.001, "loss": 2.1597, "num_input_tokens_seen": 28700090368, "step": 54750 }, { "epoch": 0.5227947577423471, "grad_norm": 0.1583367884159088, "learning_rate": 0.001, "loss": 2.1712, "num_input_tokens_seen": 28726303904, "step": 54800 }, { "epoch": 0.5232717602585354, "grad_norm": 0.1432129442691803, "learning_rate": 0.001, "loss": 2.1565, "num_input_tokens_seen": 28752517600, "step": 54850 }, { "epoch": 0.5237487627747236, "grad_norm": 0.1418701410293579, "learning_rate": 0.001, "loss": 2.1515, "num_input_tokens_seen": 28778730016, "step": 54900 }, { "epoch": 0.5242257652909119, "grad_norm": 0.14983917772769928, "learning_rate": 0.001, "loss": 2.1529, "num_input_tokens_seen": 28804938496, "step": 54950 }, { "epoch": 0.5247027678071002, "grad_norm": 0.13952849805355072, "learning_rate": 0.001, "loss": 2.1554, "num_input_tokens_seen": 28831152896, "step": 55000 }, { "epoch": 0.5247027678071002, "eval_loss": 2.072495222091675, "eval_runtime": 80.1672, "eval_samples_per_second": 62.37, "eval_steps_per_second": 15.592, "num_input_tokens_seen": 28831152896, "step": 55000 }, { "epoch": 0.5251797703232884, "grad_norm": 0.15341860055923462, "learning_rate": 0.001, "loss": 2.1638, "num_input_tokens_seen": 28857367296, "step": 55050 }, { "epoch": 0.5256567728394768, "grad_norm": 0.1550397276878357, "learning_rate": 0.001, "loss": 2.1591, "num_input_tokens_seen": 28883573728, "step": 55100 }, { "epoch": 0.526133775355665, "grad_norm": 0.13328562676906586, "learning_rate": 0.001, "loss": 2.1647, "num_input_tokens_seen": 28909785952, "step": 55150 }, { "epoch": 0.5266107778718533, "grad_norm": 0.14107167720794678, "learning_rate": 0.001, "loss": 2.1543, "num_input_tokens_seen": 28935993792, "step": 55200 }, { "epoch": 0.5270877803880415, "grad_norm": 0.13323615491390228, "learning_rate": 0.001, "loss": 2.154, "num_input_tokens_seen": 28962207328, "step": 55250 }, { "epoch": 0.5275647829042298, "grad_norm": 0.14103908836841583, "learning_rate": 0.001, "loss": 2.159, "num_input_tokens_seen": 28988419424, "step": 55300 }, { "epoch": 0.5280417854204181, "grad_norm": 0.14379121363162994, "learning_rate": 0.001, "loss": 2.1515, "num_input_tokens_seen": 29014627904, "step": 55350 }, { "epoch": 0.5285187879366063, "grad_norm": 0.14381948113441467, "learning_rate": 0.001, "loss": 2.142, "num_input_tokens_seen": 29040837120, "step": 55400 }, { "epoch": 0.5289957904527947, "grad_norm": 0.13829398155212402, "learning_rate": 0.001, "loss": 2.1573, "num_input_tokens_seen": 29067049824, "step": 55450 }, { "epoch": 0.5294727929689829, "grad_norm": 0.14373236894607544, "learning_rate": 0.001, "loss": 2.1505, "num_input_tokens_seen": 29093257888, "step": 55500 }, { "epoch": 0.5294727929689829, "eval_loss": 2.0716280937194824, "eval_runtime": 80.497, "eval_samples_per_second": 62.114, "eval_steps_per_second": 15.529, "num_input_tokens_seen": 29093257888, "step": 55500 }, { "epoch": 0.5299497954851712, "grad_norm": 0.1435646265745163, "learning_rate": 0.001, "loss": 2.1534, "num_input_tokens_seen": 29119472096, "step": 55550 }, { "epoch": 0.5304267980013595, "grad_norm": 0.15286391973495483, "learning_rate": 0.001, "loss": 2.1538, "num_input_tokens_seen": 29145685600, "step": 55600 }, { "epoch": 0.5309038005175477, "grad_norm": 0.15763621032238007, "learning_rate": 0.001, "loss": 2.1598, "num_input_tokens_seen": 29171894688, "step": 55650 }, { "epoch": 0.531380803033736, "grad_norm": 0.17268946766853333, "learning_rate": 0.001, "loss": 2.1619, "num_input_tokens_seen": 29198109088, "step": 55700 }, { "epoch": 0.5318578055499242, "grad_norm": 0.14589810371398926, "learning_rate": 0.001, "loss": 2.1517, "num_input_tokens_seen": 29224315616, "step": 55750 }, { "epoch": 0.5323348080661126, "grad_norm": 0.14555124938488007, "learning_rate": 0.001, "loss": 2.1583, "num_input_tokens_seen": 29250529568, "step": 55800 }, { "epoch": 0.5328118105823009, "grad_norm": 0.15364859998226166, "learning_rate": 0.001, "loss": 2.1539, "num_input_tokens_seen": 29276735264, "step": 55850 }, { "epoch": 0.5332888130984891, "grad_norm": 0.14615200459957123, "learning_rate": 0.001, "loss": 2.1589, "num_input_tokens_seen": 29302947520, "step": 55900 }, { "epoch": 0.5337658156146774, "grad_norm": 0.13198421895503998, "learning_rate": 0.001, "loss": 2.1608, "num_input_tokens_seen": 29329161920, "step": 55950 }, { "epoch": 0.5342428181308656, "grad_norm": 0.1391836404800415, "learning_rate": 0.001, "loss": 2.1491, "num_input_tokens_seen": 29355372320, "step": 56000 }, { "epoch": 0.5342428181308656, "eval_loss": 2.0713729858398438, "eval_runtime": 80.4159, "eval_samples_per_second": 62.177, "eval_steps_per_second": 15.544, "num_input_tokens_seen": 29355372320, "step": 56000 }, { "epoch": 0.5347198206470539, "grad_norm": 0.14797906577587128, "learning_rate": 0.001, "loss": 2.1577, "num_input_tokens_seen": 29381576544, "step": 56050 }, { "epoch": 0.5351968231632422, "grad_norm": 0.15340618789196014, "learning_rate": 0.001, "loss": 2.1528, "num_input_tokens_seen": 29407783104, "step": 56100 }, { "epoch": 0.5356738256794304, "grad_norm": 0.15017147362232208, "learning_rate": 0.001, "loss": 2.151, "num_input_tokens_seen": 29433993216, "step": 56150 }, { "epoch": 0.5361508281956188, "grad_norm": 0.13791312277317047, "learning_rate": 0.001, "loss": 2.1513, "num_input_tokens_seen": 29460207616, "step": 56200 }, { "epoch": 0.536627830711807, "grad_norm": 0.14975033700466156, "learning_rate": 0.001, "loss": 2.1624, "num_input_tokens_seen": 29486420512, "step": 56250 }, { "epoch": 0.5371048332279953, "grad_norm": 0.1503009796142578, "learning_rate": 0.001, "loss": 2.1662, "num_input_tokens_seen": 29512633952, "step": 56300 }, { "epoch": 0.5375818357441835, "grad_norm": 0.15859892964363098, "learning_rate": 0.001, "loss": 2.1643, "num_input_tokens_seen": 29538847776, "step": 56350 }, { "epoch": 0.5380588382603718, "grad_norm": 0.14404819905757904, "learning_rate": 0.001, "loss": 2.149, "num_input_tokens_seen": 29565059040, "step": 56400 }, { "epoch": 0.5385358407765601, "grad_norm": 0.14447428286075592, "learning_rate": 0.001, "loss": 2.1533, "num_input_tokens_seen": 29591271488, "step": 56450 }, { "epoch": 0.5390128432927483, "grad_norm": 0.1475590616464615, "learning_rate": 0.001, "loss": 2.1471, "num_input_tokens_seen": 29617485024, "step": 56500 }, { "epoch": 0.5390128432927483, "eval_loss": 2.070662498474121, "eval_runtime": 80.5279, "eval_samples_per_second": 62.09, "eval_steps_per_second": 15.523, "num_input_tokens_seen": 29617485024, "step": 56500 }, { "epoch": 0.5394898458089367, "grad_norm": 0.15244145691394806, "learning_rate": 0.001, "loss": 2.1552, "num_input_tokens_seen": 29643697696, "step": 56550 }, { "epoch": 0.5399668483251249, "grad_norm": 0.1519034206867218, "learning_rate": 0.001, "loss": 2.1538, "num_input_tokens_seen": 29669908640, "step": 56600 }, { "epoch": 0.5404438508413132, "grad_norm": 0.14255867898464203, "learning_rate": 0.001, "loss": 2.1503, "num_input_tokens_seen": 29696119200, "step": 56650 }, { "epoch": 0.5409208533575015, "grad_norm": 0.13525427877902985, "learning_rate": 0.001, "loss": 2.161, "num_input_tokens_seen": 29722325888, "step": 56700 }, { "epoch": 0.5413978558736897, "grad_norm": 0.15784476697444916, "learning_rate": 0.001, "loss": 2.1473, "num_input_tokens_seen": 29748536224, "step": 56750 }, { "epoch": 0.541874858389878, "grad_norm": 0.1454872041940689, "learning_rate": 0.001, "loss": 2.1471, "num_input_tokens_seen": 29774749824, "step": 56800 }, { "epoch": 0.5423518609060662, "grad_norm": 0.1350981444120407, "learning_rate": 0.001, "loss": 2.1638, "num_input_tokens_seen": 29800959392, "step": 56850 }, { "epoch": 0.5428288634222546, "grad_norm": 0.13668446242809296, "learning_rate": 0.001, "loss": 2.1497, "num_input_tokens_seen": 29827173792, "step": 56900 }, { "epoch": 0.5433058659384429, "grad_norm": 0.14868319034576416, "learning_rate": 0.001, "loss": 2.1597, "num_input_tokens_seen": 29853388192, "step": 56950 }, { "epoch": 0.5437828684546311, "grad_norm": 0.15703202784061432, "learning_rate": 0.001, "loss": 2.1465, "num_input_tokens_seen": 29879599072, "step": 57000 }, { "epoch": 0.5437828684546311, "eval_loss": 2.0691609382629395, "eval_runtime": 79.8554, "eval_samples_per_second": 62.613, "eval_steps_per_second": 15.653, "num_input_tokens_seen": 29879599072, "step": 57000 }, { "epoch": 0.5442598709708194, "grad_norm": 0.1478765606880188, "learning_rate": 0.001, "loss": 2.1537, "num_input_tokens_seen": 29905809824, "step": 57050 }, { "epoch": 0.5447368734870076, "grad_norm": 0.15318194031715393, "learning_rate": 0.001, "loss": 2.1605, "num_input_tokens_seen": 29932019488, "step": 57100 }, { "epoch": 0.5452138760031959, "grad_norm": 0.14850732684135437, "learning_rate": 0.001, "loss": 2.1562, "num_input_tokens_seen": 29958231520, "step": 57150 }, { "epoch": 0.5456908785193841, "grad_norm": 0.14641685783863068, "learning_rate": 0.001, "loss": 2.1582, "num_input_tokens_seen": 29984433824, "step": 57200 }, { "epoch": 0.5461678810355725, "grad_norm": 0.14056475460529327, "learning_rate": 0.001, "loss": 2.1609, "num_input_tokens_seen": 30010648224, "step": 57250 }, { "epoch": 0.5466448835517608, "grad_norm": 0.1431768536567688, "learning_rate": 0.001, "loss": 2.1562, "num_input_tokens_seen": 30036862624, "step": 57300 }, { "epoch": 0.547121886067949, "grad_norm": 0.13748180866241455, "learning_rate": 0.001, "loss": 2.1489, "num_input_tokens_seen": 30063070240, "step": 57350 }, { "epoch": 0.5475988885841373, "grad_norm": 0.1455860286951065, "learning_rate": 0.001, "loss": 2.1532, "num_input_tokens_seen": 30089281728, "step": 57400 }, { "epoch": 0.5480758911003255, "grad_norm": 0.13956403732299805, "learning_rate": 0.001, "loss": 2.1602, "num_input_tokens_seen": 30115488128, "step": 57450 }, { "epoch": 0.5485528936165138, "grad_norm": 0.13826127350330353, "learning_rate": 0.001, "loss": 2.1511, "num_input_tokens_seen": 30141698752, "step": 57500 }, { "epoch": 0.5485528936165138, "eval_loss": 2.068099021911621, "eval_runtime": 80.8716, "eval_samples_per_second": 61.826, "eval_steps_per_second": 15.457, "num_input_tokens_seen": 30141698752, "step": 57500 }, { "epoch": 0.5490298961327021, "grad_norm": 0.1383499652147293, "learning_rate": 0.001, "loss": 2.1446, "num_input_tokens_seen": 30167909792, "step": 57550 }, { "epoch": 0.5495068986488904, "grad_norm": 0.14953608810901642, "learning_rate": 0.001, "loss": 2.1542, "num_input_tokens_seen": 30194124192, "step": 57600 }, { "epoch": 0.5499839011650787, "grad_norm": 0.15742124617099762, "learning_rate": 0.001, "loss": 2.1323, "num_input_tokens_seen": 30220323232, "step": 57650 }, { "epoch": 0.5504609036812669, "grad_norm": 0.14026963710784912, "learning_rate": 0.001, "loss": 2.1545, "num_input_tokens_seen": 30246537216, "step": 57700 }, { "epoch": 0.5509379061974552, "grad_norm": 0.14743369817733765, "learning_rate": 0.001, "loss": 2.1414, "num_input_tokens_seen": 30272747712, "step": 57750 }, { "epoch": 0.5514149087136435, "grad_norm": 0.13608640432357788, "learning_rate": 0.001, "loss": 2.1511, "num_input_tokens_seen": 30298946976, "step": 57800 }, { "epoch": 0.5518919112298317, "grad_norm": 0.1552729308605194, "learning_rate": 0.001, "loss": 2.1574, "num_input_tokens_seen": 30325148864, "step": 57850 }, { "epoch": 0.55236891374602, "grad_norm": 0.13814964890480042, "learning_rate": 0.001, "loss": 2.1452, "num_input_tokens_seen": 30351363072, "step": 57900 }, { "epoch": 0.5528459162622082, "grad_norm": 0.14916428923606873, "learning_rate": 0.001, "loss": 2.1473, "num_input_tokens_seen": 30377574912, "step": 57950 }, { "epoch": 0.5533229187783966, "grad_norm": 0.14532601833343506, "learning_rate": 0.001, "loss": 2.1456, "num_input_tokens_seen": 30403788864, "step": 58000 }, { "epoch": 0.5533229187783966, "eval_loss": 2.068753480911255, "eval_runtime": 80.9146, "eval_samples_per_second": 61.794, "eval_steps_per_second": 15.448, "num_input_tokens_seen": 30403788864, "step": 58000 }, { "epoch": 0.5537999212945848, "grad_norm": 0.15391622483730316, "learning_rate": 0.001, "loss": 2.1682, "num_input_tokens_seen": 30429999072, "step": 58050 }, { "epoch": 0.5542769238107731, "grad_norm": 0.1502559632062912, "learning_rate": 0.001, "loss": 2.1568, "num_input_tokens_seen": 30456205120, "step": 58100 }, { "epoch": 0.5547539263269614, "grad_norm": 0.13535556197166443, "learning_rate": 0.001, "loss": 2.1493, "num_input_tokens_seen": 30482418400, "step": 58150 }, { "epoch": 0.5552309288431496, "grad_norm": 0.14326569437980652, "learning_rate": 0.001, "loss": 2.1436, "num_input_tokens_seen": 30508626208, "step": 58200 }, { "epoch": 0.5557079313593379, "grad_norm": 0.14152300357818604, "learning_rate": 0.001, "loss": 2.1595, "num_input_tokens_seen": 30534835584, "step": 58250 }, { "epoch": 0.5561849338755261, "grad_norm": 0.14481306076049805, "learning_rate": 0.001, "loss": 2.1551, "num_input_tokens_seen": 30561049984, "step": 58300 }, { "epoch": 0.5566619363917145, "grad_norm": 0.13141630589962006, "learning_rate": 0.001, "loss": 2.1404, "num_input_tokens_seen": 30587264384, "step": 58350 }, { "epoch": 0.5571389389079028, "grad_norm": 0.15466631948947906, "learning_rate": 0.001, "loss": 2.1435, "num_input_tokens_seen": 30613475328, "step": 58400 }, { "epoch": 0.557615941424091, "grad_norm": 0.14728710055351257, "learning_rate": 0.001, "loss": 2.1499, "num_input_tokens_seen": 30639680256, "step": 58450 }, { "epoch": 0.5580929439402793, "grad_norm": 0.14924204349517822, "learning_rate": 0.001, "loss": 2.1591, "num_input_tokens_seen": 30665890560, "step": 58500 }, { "epoch": 0.5580929439402793, "eval_loss": 2.066398859024048, "eval_runtime": 80.4535, "eval_samples_per_second": 62.148, "eval_steps_per_second": 15.537, "num_input_tokens_seen": 30665890560, "step": 58500 }, { "epoch": 0.5585699464564675, "grad_norm": 0.1510065197944641, "learning_rate": 0.001, "loss": 2.1457, "num_input_tokens_seen": 30692103392, "step": 58550 }, { "epoch": 0.5590469489726558, "grad_norm": 0.13719449937343597, "learning_rate": 0.001, "loss": 2.1502, "num_input_tokens_seen": 30718316864, "step": 58600 }, { "epoch": 0.5595239514888442, "grad_norm": 0.1269613802433014, "learning_rate": 0.001, "loss": 2.1563, "num_input_tokens_seen": 30744530464, "step": 58650 }, { "epoch": 0.5600009540050324, "grad_norm": 0.15356452763080597, "learning_rate": 0.001, "loss": 2.1496, "num_input_tokens_seen": 30770743616, "step": 58700 }, { "epoch": 0.5604779565212207, "grad_norm": 0.1365087777376175, "learning_rate": 0.001, "loss": 2.1601, "num_input_tokens_seen": 30796958016, "step": 58750 }, { "epoch": 0.5609549590374089, "grad_norm": 0.15105237066745758, "learning_rate": 0.001, "loss": 2.1504, "num_input_tokens_seen": 30823166336, "step": 58800 }, { "epoch": 0.5614319615535972, "grad_norm": 0.15393078327178955, "learning_rate": 0.001, "loss": 2.1523, "num_input_tokens_seen": 30849379456, "step": 58850 }, { "epoch": 0.5619089640697855, "grad_norm": 0.15258848667144775, "learning_rate": 0.001, "loss": 2.1679, "num_input_tokens_seen": 30875585280, "step": 58900 }, { "epoch": 0.5623859665859737, "grad_norm": 0.14167654514312744, "learning_rate": 0.001, "loss": 2.1626, "num_input_tokens_seen": 30901791328, "step": 58950 }, { "epoch": 0.562862969102162, "grad_norm": 0.1519978791475296, "learning_rate": 0.001, "loss": 2.1508, "num_input_tokens_seen": 30927998464, "step": 59000 }, { "epoch": 0.562862969102162, "eval_loss": 2.067110300064087, "eval_runtime": 80.044, "eval_samples_per_second": 62.466, "eval_steps_per_second": 15.616, "num_input_tokens_seen": 30927998464, "step": 59000 }, { "epoch": 0.5633399716183503, "grad_norm": 0.1528017818927765, "learning_rate": 0.001, "loss": 2.1535, "num_input_tokens_seen": 30954208032, "step": 59050 }, { "epoch": 0.5638169741345386, "grad_norm": 0.13762027025222778, "learning_rate": 0.001, "loss": 2.1442, "num_input_tokens_seen": 30980422432, "step": 59100 }, { "epoch": 0.5642939766507268, "grad_norm": 0.15064965188503265, "learning_rate": 0.001, "loss": 2.1572, "num_input_tokens_seen": 31006636832, "step": 59150 }, { "epoch": 0.5647709791669151, "grad_norm": 0.14274545013904572, "learning_rate": 0.001, "loss": 2.1514, "num_input_tokens_seen": 31032842400, "step": 59200 }, { "epoch": 0.5652479816831034, "grad_norm": 0.15505096316337585, "learning_rate": 0.001, "loss": 2.1544, "num_input_tokens_seen": 31059054240, "step": 59250 }, { "epoch": 0.5657249841992916, "grad_norm": 0.1395845264196396, "learning_rate": 0.001, "loss": 2.1578, "num_input_tokens_seen": 31085264224, "step": 59300 }, { "epoch": 0.5662019867154799, "grad_norm": 0.14424628019332886, "learning_rate": 0.001, "loss": 2.1521, "num_input_tokens_seen": 31111477504, "step": 59350 }, { "epoch": 0.5666789892316682, "grad_norm": 0.14009548723697662, "learning_rate": 0.001, "loss": 2.1393, "num_input_tokens_seen": 31137691904, "step": 59400 }, { "epoch": 0.5671559917478565, "grad_norm": 0.1312466412782669, "learning_rate": 0.001, "loss": 2.1489, "num_input_tokens_seen": 31163906304, "step": 59450 }, { "epoch": 0.5676329942640448, "grad_norm": 0.14498138427734375, "learning_rate": 0.001, "loss": 2.1466, "num_input_tokens_seen": 31190116608, "step": 59500 }, { "epoch": 0.5676329942640448, "eval_loss": 2.06640362739563, "eval_runtime": 80.0487, "eval_samples_per_second": 62.462, "eval_steps_per_second": 15.615, "num_input_tokens_seen": 31190116608, "step": 59500 }, { "epoch": 0.568109996780233, "grad_norm": 0.13518276810646057, "learning_rate": 0.001, "loss": 2.1462, "num_input_tokens_seen": 31216326496, "step": 59550 }, { "epoch": 0.5685869992964213, "grad_norm": 0.14145778119564056, "learning_rate": 0.001, "loss": 2.1474, "num_input_tokens_seen": 31242536256, "step": 59600 }, { "epoch": 0.5690640018126095, "grad_norm": 0.14637036621570587, "learning_rate": 0.001, "loss": 2.1475, "num_input_tokens_seen": 31268747936, "step": 59650 }, { "epoch": 0.5695410043287978, "grad_norm": 0.14279405772686005, "learning_rate": 0.001, "loss": 2.1417, "num_input_tokens_seen": 31294956608, "step": 59700 }, { "epoch": 0.5700180068449862, "grad_norm": 0.14410801231861115, "learning_rate": 0.001, "loss": 2.1456, "num_input_tokens_seen": 31321167264, "step": 59750 }, { "epoch": 0.5704950093611744, "grad_norm": 0.14293836057186127, "learning_rate": 0.001, "loss": 2.1475, "num_input_tokens_seen": 31347378656, "step": 59800 }, { "epoch": 0.5709720118773627, "grad_norm": 0.15702761709690094, "learning_rate": 0.001, "loss": 2.1488, "num_input_tokens_seen": 31373586176, "step": 59850 }, { "epoch": 0.5714490143935509, "grad_norm": 0.15636380016803741, "learning_rate": 0.001, "loss": 2.1426, "num_input_tokens_seen": 31399797216, "step": 59900 }, { "epoch": 0.5719260169097392, "grad_norm": 0.13920319080352783, "learning_rate": 0.001, "loss": 2.148, "num_input_tokens_seen": 31426011616, "step": 59950 }, { "epoch": 0.5724030194259274, "grad_norm": 0.14815059304237366, "learning_rate": 0.001, "loss": 2.1457, "num_input_tokens_seen": 31452217632, "step": 60000 }, { "epoch": 0.5724030194259274, "eval_loss": 2.064025402069092, "eval_runtime": 79.9516, "eval_samples_per_second": 62.538, "eval_steps_per_second": 15.634, "num_input_tokens_seen": 31452217632, "step": 60000 }, { "epoch": 0.5728800219421157, "grad_norm": 0.15377280116081238, "learning_rate": 0.001, "loss": 2.1499, "num_input_tokens_seen": 31478418880, "step": 60050 }, { "epoch": 0.573357024458304, "grad_norm": 0.16740241646766663, "learning_rate": 0.001, "loss": 2.149, "num_input_tokens_seen": 31504632704, "step": 60100 }, { "epoch": 0.5738340269744923, "grad_norm": 0.1350049525499344, "learning_rate": 0.001, "loss": 2.1432, "num_input_tokens_seen": 31530844128, "step": 60150 }, { "epoch": 0.5743110294906806, "grad_norm": 0.145762100815773, "learning_rate": 0.001, "loss": 2.1492, "num_input_tokens_seen": 31557058528, "step": 60200 }, { "epoch": 0.5747880320068688, "grad_norm": 0.1441580355167389, "learning_rate": 0.001, "loss": 2.1417, "num_input_tokens_seen": 31583268320, "step": 60250 }, { "epoch": 0.5752650345230571, "grad_norm": 0.153322234749794, "learning_rate": 0.001, "loss": 2.1508, "num_input_tokens_seen": 31609470656, "step": 60300 }, { "epoch": 0.5757420370392454, "grad_norm": 0.14399965107440948, "learning_rate": 0.001, "loss": 2.1421, "num_input_tokens_seen": 31635684704, "step": 60350 }, { "epoch": 0.5762190395554336, "grad_norm": 0.13685567677021027, "learning_rate": 0.001, "loss": 2.1406, "num_input_tokens_seen": 31661895968, "step": 60400 }, { "epoch": 0.576696042071622, "grad_norm": 0.21189793944358826, "learning_rate": 0.001, "loss": 2.1565, "num_input_tokens_seen": 31688104736, "step": 60450 }, { "epoch": 0.5771730445878102, "grad_norm": 0.13776901364326477, "learning_rate": 0.001, "loss": 2.1496, "num_input_tokens_seen": 31714314848, "step": 60500 }, { "epoch": 0.5771730445878102, "eval_loss": 2.063593864440918, "eval_runtime": 80.1547, "eval_samples_per_second": 62.379, "eval_steps_per_second": 15.595, "num_input_tokens_seen": 31714314848, "step": 60500 }, { "epoch": 0.5776500471039985, "grad_norm": 0.1537674516439438, "learning_rate": 0.001, "loss": 2.1439, "num_input_tokens_seen": 31740523712, "step": 60550 }, { "epoch": 0.5781270496201868, "grad_norm": 0.1462978571653366, "learning_rate": 0.001, "loss": 2.1539, "num_input_tokens_seen": 31766736480, "step": 60600 }, { "epoch": 0.578604052136375, "grad_norm": 0.14669708907604218, "learning_rate": 0.001, "loss": 2.1576, "num_input_tokens_seen": 31792947904, "step": 60650 }, { "epoch": 0.5790810546525633, "grad_norm": 0.1519545167684555, "learning_rate": 0.001, "loss": 2.1453, "num_input_tokens_seen": 31819160672, "step": 60700 }, { "epoch": 0.5795580571687515, "grad_norm": 0.1466340571641922, "learning_rate": 0.001, "loss": 2.1434, "num_input_tokens_seen": 31845374816, "step": 60750 }, { "epoch": 0.5800350596849398, "grad_norm": 0.13935734331607819, "learning_rate": 0.001, "loss": 2.1423, "num_input_tokens_seen": 31871587232, "step": 60800 }, { "epoch": 0.5805120622011282, "grad_norm": 0.14017197489738464, "learning_rate": 0.001, "loss": 2.14, "num_input_tokens_seen": 31897801280, "step": 60850 }, { "epoch": 0.5809890647173164, "grad_norm": 0.14253723621368408, "learning_rate": 0.001, "loss": 2.1387, "num_input_tokens_seen": 31924014720, "step": 60900 }, { "epoch": 0.5814660672335047, "grad_norm": 0.15480197966098785, "learning_rate": 0.001, "loss": 2.1647, "num_input_tokens_seen": 31950220448, "step": 60950 }, { "epoch": 0.5819430697496929, "grad_norm": 0.1502438485622406, "learning_rate": 0.001, "loss": 2.1418, "num_input_tokens_seen": 31976431072, "step": 61000 }, { "epoch": 0.5819430697496929, "eval_loss": 2.0649499893188477, "eval_runtime": 80.1357, "eval_samples_per_second": 62.394, "eval_steps_per_second": 15.599, "num_input_tokens_seen": 31976431072, "step": 61000 }, { "epoch": 0.5824200722658812, "grad_norm": 0.14360016584396362, "learning_rate": 0.001, "loss": 2.1464, "num_input_tokens_seen": 32002645472, "step": 61050 }, { "epoch": 0.5828970747820694, "grad_norm": 0.1369880586862564, "learning_rate": 0.001, "loss": 2.1556, "num_input_tokens_seen": 32028853216, "step": 61100 }, { "epoch": 0.5833740772982577, "grad_norm": 0.1452026516199112, "learning_rate": 0.001, "loss": 2.1354, "num_input_tokens_seen": 32055062080, "step": 61150 }, { "epoch": 0.5838510798144461, "grad_norm": 0.14710959792137146, "learning_rate": 0.001, "loss": 2.1472, "num_input_tokens_seen": 32081272192, "step": 61200 }, { "epoch": 0.5843280823306343, "grad_norm": 0.14364252984523773, "learning_rate": 0.001, "loss": 2.1562, "num_input_tokens_seen": 32107485248, "step": 61250 }, { "epoch": 0.5848050848468226, "grad_norm": 0.1464463770389557, "learning_rate": 0.001, "loss": 2.1455, "num_input_tokens_seen": 32133699648, "step": 61300 }, { "epoch": 0.5852820873630108, "grad_norm": 0.13232292234897614, "learning_rate": 0.001, "loss": 2.1515, "num_input_tokens_seen": 32159909088, "step": 61350 }, { "epoch": 0.5857590898791991, "grad_norm": 0.14374487102031708, "learning_rate": 0.001, "loss": 2.1447, "num_input_tokens_seen": 32186118336, "step": 61400 }, { "epoch": 0.5862360923953874, "grad_norm": 0.13590660691261292, "learning_rate": 0.001, "loss": 2.1431, "num_input_tokens_seen": 32212322976, "step": 61450 }, { "epoch": 0.5867130949115756, "grad_norm": 0.14497828483581543, "learning_rate": 0.001, "loss": 2.1477, "num_input_tokens_seen": 32238532768, "step": 61500 }, { "epoch": 0.5867130949115756, "eval_loss": 2.0637688636779785, "eval_runtime": 80.1948, "eval_samples_per_second": 62.348, "eval_steps_per_second": 15.587, "num_input_tokens_seen": 32238532768, "step": 61500 }, { "epoch": 0.587190097427764, "grad_norm": 0.15234364569187164, "learning_rate": 0.001, "loss": 2.1518, "num_input_tokens_seen": 32264747168, "step": 61550 }, { "epoch": 0.5876670999439522, "grad_norm": 0.15645268559455872, "learning_rate": 0.001, "loss": 2.1417, "num_input_tokens_seen": 32290958592, "step": 61600 }, { "epoch": 0.5881441024601405, "grad_norm": 0.13095822930335999, "learning_rate": 0.001, "loss": 2.1556, "num_input_tokens_seen": 32317172864, "step": 61650 }, { "epoch": 0.5886211049763288, "grad_norm": 0.14591479301452637, "learning_rate": 0.001, "loss": 2.1427, "num_input_tokens_seen": 32343387264, "step": 61700 }, { "epoch": 0.589098107492517, "grad_norm": 0.14499343931674957, "learning_rate": 0.001, "loss": 2.1476, "num_input_tokens_seen": 32369597760, "step": 61750 }, { "epoch": 0.5895751100087053, "grad_norm": 0.1538584977388382, "learning_rate": 0.001, "loss": 2.1479, "num_input_tokens_seen": 32395811904, "step": 61800 }, { "epoch": 0.5900521125248935, "grad_norm": 0.14564937353134155, "learning_rate": 0.001, "loss": 2.1369, "num_input_tokens_seen": 32422005088, "step": 61850 }, { "epoch": 0.5905291150410819, "grad_norm": 0.15571440756320953, "learning_rate": 0.001, "loss": 2.1452, "num_input_tokens_seen": 32448215552, "step": 61900 }, { "epoch": 0.5910061175572701, "grad_norm": 0.15152624249458313, "learning_rate": 0.001, "loss": 2.1501, "num_input_tokens_seen": 32474410528, "step": 61950 }, { "epoch": 0.5914831200734584, "grad_norm": 0.14020481705665588, "learning_rate": 0.001, "loss": 2.137, "num_input_tokens_seen": 32500617568, "step": 62000 }, { "epoch": 0.5914831200734584, "eval_loss": 2.0610291957855225, "eval_runtime": 80.4672, "eval_samples_per_second": 62.137, "eval_steps_per_second": 15.534, "num_input_tokens_seen": 32500617568, "step": 62000 }, { "epoch": 0.5919601225896467, "grad_norm": 0.13953524827957153, "learning_rate": 0.001, "loss": 2.1443, "num_input_tokens_seen": 32526826016, "step": 62050 }, { "epoch": 0.5924371251058349, "grad_norm": 0.13924415409564972, "learning_rate": 0.001, "loss": 2.1439, "num_input_tokens_seen": 32553032032, "step": 62100 }, { "epoch": 0.5929141276220232, "grad_norm": 0.15134859085083008, "learning_rate": 0.001, "loss": 2.1457, "num_input_tokens_seen": 32579225888, "step": 62150 }, { "epoch": 0.5933911301382114, "grad_norm": 0.14563137292861938, "learning_rate": 0.001, "loss": 2.1426, "num_input_tokens_seen": 32605435776, "step": 62200 }, { "epoch": 0.5938681326543997, "grad_norm": 0.14072252810001373, "learning_rate": 0.001, "loss": 2.1561, "num_input_tokens_seen": 32631646944, "step": 62250 }, { "epoch": 0.5943451351705881, "grad_norm": 0.1321408897638321, "learning_rate": 0.001, "loss": 2.1328, "num_input_tokens_seen": 32657860736, "step": 62300 }, { "epoch": 0.5948221376867763, "grad_norm": 0.13987164199352264, "learning_rate": 0.001, "loss": 2.1409, "num_input_tokens_seen": 32684073568, "step": 62350 }, { "epoch": 0.5952991402029646, "grad_norm": 0.14474605023860931, "learning_rate": 0.001, "loss": 2.1499, "num_input_tokens_seen": 32710284704, "step": 62400 }, { "epoch": 0.5957761427191528, "grad_norm": 0.14219556748867035, "learning_rate": 0.001, "loss": 2.1514, "num_input_tokens_seen": 32736491200, "step": 62450 }, { "epoch": 0.5962531452353411, "grad_norm": 0.15876619517803192, "learning_rate": 0.001, "loss": 2.1415, "num_input_tokens_seen": 32762704928, "step": 62500 }, { "epoch": 0.5962531452353411, "eval_loss": 2.0606014728546143, "eval_runtime": 80.5848, "eval_samples_per_second": 62.046, "eval_steps_per_second": 15.512, "num_input_tokens_seen": 32762704928, "step": 62500 }, { "epoch": 0.5967301477515294, "grad_norm": 0.14429977536201477, "learning_rate": 0.001, "loss": 2.1462, "num_input_tokens_seen": 32788919328, "step": 62550 }, { "epoch": 0.5972071502677176, "grad_norm": 0.1436418741941452, "learning_rate": 0.001, "loss": 2.1495, "num_input_tokens_seen": 32815127776, "step": 62600 }, { "epoch": 0.597684152783906, "grad_norm": 0.14079809188842773, "learning_rate": 0.001, "loss": 2.1478, "num_input_tokens_seen": 32841342144, "step": 62650 }, { "epoch": 0.5981611553000942, "grad_norm": 0.13799606263637543, "learning_rate": 0.001, "loss": 2.1487, "num_input_tokens_seen": 32867549472, "step": 62700 }, { "epoch": 0.5986381578162825, "grad_norm": 0.13869820535182953, "learning_rate": 0.001, "loss": 2.1379, "num_input_tokens_seen": 32893759680, "step": 62750 }, { "epoch": 0.5991151603324708, "grad_norm": 0.1471201330423355, "learning_rate": 0.001, "loss": 2.1573, "num_input_tokens_seen": 32919968544, "step": 62800 }, { "epoch": 0.599592162848659, "grad_norm": 0.14617429673671722, "learning_rate": 0.001, "loss": 2.1338, "num_input_tokens_seen": 32946179008, "step": 62850 }, { "epoch": 0.6000691653648473, "grad_norm": 0.14857004582881927, "learning_rate": 0.001, "loss": 2.1417, "num_input_tokens_seen": 32972393408, "step": 62900 }, { "epoch": 0.6005461678810355, "grad_norm": 0.1417587846517563, "learning_rate": 0.001, "loss": 2.1414, "num_input_tokens_seen": 32998607808, "step": 62950 }, { "epoch": 0.6010231703972239, "grad_norm": 0.13704361021518707, "learning_rate": 0.001, "loss": 2.1459, "num_input_tokens_seen": 33024820736, "step": 63000 }, { "epoch": 0.6010231703972239, "eval_loss": 2.0603103637695312, "eval_runtime": 79.9162, "eval_samples_per_second": 62.566, "eval_steps_per_second": 15.641, "num_input_tokens_seen": 33024820736, "step": 63000 }, { "epoch": 0.6015001729134121, "grad_norm": 0.13333925604820251, "learning_rate": 0.001, "loss": 2.1431, "num_input_tokens_seen": 33051029952, "step": 63050 }, { "epoch": 0.6019771754296004, "grad_norm": 0.1660204976797104, "learning_rate": 0.001, "loss": 2.1439, "num_input_tokens_seen": 33077244352, "step": 63100 }, { "epoch": 0.6024541779457887, "grad_norm": 0.14403057098388672, "learning_rate": 0.001, "loss": 2.1485, "num_input_tokens_seen": 33103448480, "step": 63150 }, { "epoch": 0.6029311804619769, "grad_norm": 0.13897418975830078, "learning_rate": 0.001, "loss": 2.1481, "num_input_tokens_seen": 33129662848, "step": 63200 }, { "epoch": 0.6034081829781652, "grad_norm": 0.1460401862859726, "learning_rate": 0.001, "loss": 2.153, "num_input_tokens_seen": 33155874816, "step": 63250 }, { "epoch": 0.6038851854943534, "grad_norm": 0.1433630883693695, "learning_rate": 0.001, "loss": 2.1408, "num_input_tokens_seen": 33182089024, "step": 63300 }, { "epoch": 0.6043621880105418, "grad_norm": 0.1596335917711258, "learning_rate": 0.001, "loss": 2.1493, "num_input_tokens_seen": 33208303136, "step": 63350 }, { "epoch": 0.6048391905267301, "grad_norm": 0.1354464888572693, "learning_rate": 0.001, "loss": 2.1537, "num_input_tokens_seen": 33234511872, "step": 63400 }, { "epoch": 0.6053161930429183, "grad_norm": 0.14532111585140228, "learning_rate": 0.001, "loss": 2.1365, "num_input_tokens_seen": 33260724384, "step": 63450 }, { "epoch": 0.6057931955591066, "grad_norm": 0.14670905470848083, "learning_rate": 0.001, "loss": 2.1389, "num_input_tokens_seen": 33286935872, "step": 63500 }, { "epoch": 0.6057931955591066, "eval_loss": 2.058581590652466, "eval_runtime": 79.4802, "eval_samples_per_second": 62.909, "eval_steps_per_second": 15.727, "num_input_tokens_seen": 33286935872, "step": 63500 }, { "epoch": 0.6062701980752948, "grad_norm": 0.1628328263759613, "learning_rate": 0.001, "loss": 2.1427, "num_input_tokens_seen": 33313150272, "step": 63550 }, { "epoch": 0.6067472005914831, "grad_norm": 0.14063307642936707, "learning_rate": 0.001, "loss": 2.1457, "num_input_tokens_seen": 33339359200, "step": 63600 }, { "epoch": 0.6072242031076714, "grad_norm": 0.1543467491865158, "learning_rate": 0.001, "loss": 2.1416, "num_input_tokens_seen": 33365572768, "step": 63650 }, { "epoch": 0.6077012056238597, "grad_norm": 0.13870377838611603, "learning_rate": 0.001, "loss": 2.1471, "num_input_tokens_seen": 33391785248, "step": 63700 }, { "epoch": 0.608178208140048, "grad_norm": 0.15191654860973358, "learning_rate": 0.001, "loss": 2.137, "num_input_tokens_seen": 33417994304, "step": 63750 }, { "epoch": 0.6086552106562362, "grad_norm": 0.15023675560951233, "learning_rate": 0.001, "loss": 2.1513, "num_input_tokens_seen": 33444202144, "step": 63800 }, { "epoch": 0.6091322131724245, "grad_norm": 0.14828945696353912, "learning_rate": 0.001, "loss": 2.1359, "num_input_tokens_seen": 33470406880, "step": 63850 }, { "epoch": 0.6096092156886127, "grad_norm": 0.1423346996307373, "learning_rate": 0.001, "loss": 2.121, "num_input_tokens_seen": 33496614304, "step": 63900 }, { "epoch": 0.610086218204801, "grad_norm": 0.1419682651758194, "learning_rate": 0.001, "loss": 2.1475, "num_input_tokens_seen": 33522828352, "step": 63950 }, { "epoch": 0.6105632207209893, "grad_norm": 0.14732129871845245, "learning_rate": 0.001, "loss": 2.1367, "num_input_tokens_seen": 33549034848, "step": 64000 }, { "epoch": 0.6105632207209893, "eval_loss": 2.0587704181671143, "eval_runtime": 80.0375, "eval_samples_per_second": 62.471, "eval_steps_per_second": 15.618, "num_input_tokens_seen": 33549034848, "step": 64000 }, { "epoch": 0.6110402232371775, "grad_norm": 0.1425338238477707, "learning_rate": 0.001, "loss": 2.1458, "num_input_tokens_seen": 33575249248, "step": 64050 }, { "epoch": 0.6115172257533659, "grad_norm": 0.16987043619155884, "learning_rate": 0.001, "loss": 2.1534, "num_input_tokens_seen": 33601461568, "step": 64100 }, { "epoch": 0.6119942282695541, "grad_norm": 0.1452094316482544, "learning_rate": 0.001, "loss": 2.139, "num_input_tokens_seen": 33627675392, "step": 64150 }, { "epoch": 0.6124712307857424, "grad_norm": 0.14077788591384888, "learning_rate": 0.001, "loss": 2.1396, "num_input_tokens_seen": 33653881632, "step": 64200 }, { "epoch": 0.6129482333019307, "grad_norm": 0.14799240231513977, "learning_rate": 0.001, "loss": 2.1837, "num_input_tokens_seen": 33680096032, "step": 64250 }, { "epoch": 0.6134252358181189, "grad_norm": 0.14731177687644958, "learning_rate": 0.001, "loss": 2.1631, "num_input_tokens_seen": 33706309408, "step": 64300 }, { "epoch": 0.6139022383343072, "grad_norm": 0.14560987055301666, "learning_rate": 0.001, "loss": 2.1464, "num_input_tokens_seen": 33732520736, "step": 64350 }, { "epoch": 0.6143792408504954, "grad_norm": 0.15761514008045197, "learning_rate": 0.001, "loss": 2.1485, "num_input_tokens_seen": 33758730752, "step": 64400 }, { "epoch": 0.6148562433666838, "grad_norm": 0.14389316737651825, "learning_rate": 0.001, "loss": 2.1406, "num_input_tokens_seen": 33784943872, "step": 64450 }, { "epoch": 0.6153332458828721, "grad_norm": 0.12751001119613647, "learning_rate": 0.001, "loss": 2.147, "num_input_tokens_seen": 33811149696, "step": 64500 }, { "epoch": 0.6153332458828721, "eval_loss": 2.059323787689209, "eval_runtime": 80.4929, "eval_samples_per_second": 62.117, "eval_steps_per_second": 15.529, "num_input_tokens_seen": 33811149696, "step": 64500 }, { "epoch": 0.6158102483990603, "grad_norm": 0.1340908706188202, "learning_rate": 0.001, "loss": 2.1525, "num_input_tokens_seen": 33837360224, "step": 64550 }, { "epoch": 0.6162872509152486, "grad_norm": 0.1480954885482788, "learning_rate": 0.001, "loss": 2.1394, "num_input_tokens_seen": 33863574624, "step": 64600 }, { "epoch": 0.6167642534314368, "grad_norm": 0.14949338138103485, "learning_rate": 0.001, "loss": 2.1289, "num_input_tokens_seen": 33889784992, "step": 64650 }, { "epoch": 0.6172412559476251, "grad_norm": 0.13703715801239014, "learning_rate": 0.001, "loss": 2.1479, "num_input_tokens_seen": 33915999392, "step": 64700 }, { "epoch": 0.6177182584638133, "grad_norm": 0.1442926824092865, "learning_rate": 0.001, "loss": 2.1409, "num_input_tokens_seen": 33942206240, "step": 64750 }, { "epoch": 0.6181952609800017, "grad_norm": 0.12989823520183563, "learning_rate": 0.001, "loss": 2.1306, "num_input_tokens_seen": 33968419968, "step": 64800 }, { "epoch": 0.61867226349619, "grad_norm": 0.14975020289421082, "learning_rate": 0.001, "loss": 2.1391, "num_input_tokens_seen": 33994629952, "step": 64850 }, { "epoch": 0.6191492660123782, "grad_norm": 0.1355099231004715, "learning_rate": 0.001, "loss": 2.1449, "num_input_tokens_seen": 34020843104, "step": 64900 }, { "epoch": 0.6196262685285665, "grad_norm": 0.13653282821178436, "learning_rate": 0.001, "loss": 2.1398, "num_input_tokens_seen": 34047052768, "step": 64950 }, { "epoch": 0.6201032710447547, "grad_norm": 0.1443834900856018, "learning_rate": 0.001, "loss": 2.1415, "num_input_tokens_seen": 34073267168, "step": 65000 }, { "epoch": 0.6201032710447547, "eval_loss": 2.0579845905303955, "eval_runtime": 79.8036, "eval_samples_per_second": 62.654, "eval_steps_per_second": 15.663, "num_input_tokens_seen": 34073267168, "step": 65000 }, { "epoch": 0.620580273560943, "grad_norm": 0.13972166180610657, "learning_rate": 0.001, "loss": 2.1469, "num_input_tokens_seen": 34099472608, "step": 65050 }, { "epoch": 0.6210572760771313, "grad_norm": 0.15001018345355988, "learning_rate": 0.001, "loss": 2.1441, "num_input_tokens_seen": 34125685056, "step": 65100 }, { "epoch": 0.6215342785933196, "grad_norm": 0.13544200360774994, "learning_rate": 0.001, "loss": 2.1468, "num_input_tokens_seen": 34151895488, "step": 65150 }, { "epoch": 0.6220112811095079, "grad_norm": 0.13995452225208282, "learning_rate": 0.001, "loss": 2.1344, "num_input_tokens_seen": 34178106848, "step": 65200 }, { "epoch": 0.6224882836256961, "grad_norm": 0.14656807482242584, "learning_rate": 0.001, "loss": 2.1354, "num_input_tokens_seen": 34204315008, "step": 65250 }, { "epoch": 0.6229652861418844, "grad_norm": 0.14047770202159882, "learning_rate": 0.001, "loss": 2.1484, "num_input_tokens_seen": 34230529184, "step": 65300 }, { "epoch": 0.6234422886580727, "grad_norm": 0.1519668698310852, "learning_rate": 0.001, "loss": 2.146, "num_input_tokens_seen": 34256735776, "step": 65350 }, { "epoch": 0.6239192911742609, "grad_norm": 0.15893806517124176, "learning_rate": 0.001, "loss": 2.1439, "num_input_tokens_seen": 34282942784, "step": 65400 }, { "epoch": 0.6243962936904492, "grad_norm": 0.15153321623802185, "learning_rate": 0.001, "loss": 2.1423, "num_input_tokens_seen": 34309151360, "step": 65450 }, { "epoch": 0.6248732962066375, "grad_norm": 0.13959857821464539, "learning_rate": 0.001, "loss": 2.1426, "num_input_tokens_seen": 34335361632, "step": 65500 }, { "epoch": 0.6248732962066375, "eval_loss": 2.056854724884033, "eval_runtime": 80.3509, "eval_samples_per_second": 62.227, "eval_steps_per_second": 15.557, "num_input_tokens_seen": 34335361632, "step": 65500 }, { "epoch": 0.6253502987228258, "grad_norm": 0.1324777901172638, "learning_rate": 0.001, "loss": 2.1397, "num_input_tokens_seen": 34361575424, "step": 65550 }, { "epoch": 0.6258273012390141, "grad_norm": 0.14406649768352509, "learning_rate": 0.001, "loss": 2.1469, "num_input_tokens_seen": 34387789824, "step": 65600 }, { "epoch": 0.6263043037552023, "grad_norm": 0.14909878373146057, "learning_rate": 0.001, "loss": 2.1485, "num_input_tokens_seen": 34414002464, "step": 65650 }, { "epoch": 0.6267813062713906, "grad_norm": 0.14565107226371765, "learning_rate": 0.001, "loss": 2.1459, "num_input_tokens_seen": 34440213152, "step": 65700 }, { "epoch": 0.6272583087875788, "grad_norm": 0.1643950343132019, "learning_rate": 0.001, "loss": 2.1311, "num_input_tokens_seen": 34466427104, "step": 65750 }, { "epoch": 0.6277353113037671, "grad_norm": 0.14298604428768158, "learning_rate": 0.001, "loss": 2.1452, "num_input_tokens_seen": 34492641472, "step": 65800 }, { "epoch": 0.6282123138199553, "grad_norm": 0.1350494772195816, "learning_rate": 0.001, "loss": 2.1349, "num_input_tokens_seen": 34518851072, "step": 65850 }, { "epoch": 0.6286893163361437, "grad_norm": 0.15055470168590546, "learning_rate": 0.001, "loss": 2.1446, "num_input_tokens_seen": 34545063136, "step": 65900 }, { "epoch": 0.629166318852332, "grad_norm": 0.15200093388557434, "learning_rate": 0.001, "loss": 2.1377, "num_input_tokens_seen": 34571258560, "step": 65950 }, { "epoch": 0.6296433213685202, "grad_norm": 0.1419169306755066, "learning_rate": 0.001, "loss": 2.1483, "num_input_tokens_seen": 34597457472, "step": 66000 }, { "epoch": 0.6296433213685202, "eval_loss": 2.0544610023498535, "eval_runtime": 80.091, "eval_samples_per_second": 62.429, "eval_steps_per_second": 15.607, "num_input_tokens_seen": 34597457472, "step": 66000 }, { "epoch": 0.6301203238847085, "grad_norm": 0.14043961465358734, "learning_rate": 0.001, "loss": 2.1347, "num_input_tokens_seen": 34623670848, "step": 66050 }, { "epoch": 0.6305973264008967, "grad_norm": 0.14886051416397095, "learning_rate": 0.001, "loss": 2.1338, "num_input_tokens_seen": 34649880224, "step": 66100 }, { "epoch": 0.631074328917085, "grad_norm": 0.14698228240013123, "learning_rate": 0.001, "loss": 2.1375, "num_input_tokens_seen": 34676090752, "step": 66150 }, { "epoch": 0.6315513314332734, "grad_norm": 0.1423393338918686, "learning_rate": 0.001, "loss": 2.1377, "num_input_tokens_seen": 34702305152, "step": 66200 }, { "epoch": 0.6320283339494616, "grad_norm": 0.1530950367450714, "learning_rate": 0.001, "loss": 2.1421, "num_input_tokens_seen": 34728510528, "step": 66250 }, { "epoch": 0.6325053364656499, "grad_norm": 0.14289388060569763, "learning_rate": 0.001, "loss": 2.1482, "num_input_tokens_seen": 34754723296, "step": 66300 }, { "epoch": 0.6329823389818381, "grad_norm": 0.15754513442516327, "learning_rate": 0.001, "loss": 2.1319, "num_input_tokens_seen": 34780937152, "step": 66350 }, { "epoch": 0.6334593414980264, "grad_norm": 0.14707081019878387, "learning_rate": 0.001, "loss": 2.1441, "num_input_tokens_seen": 34807151552, "step": 66400 }, { "epoch": 0.6339363440142147, "grad_norm": 0.13461631536483765, "learning_rate": 0.001, "loss": 2.1467, "num_input_tokens_seen": 34833363968, "step": 66450 }, { "epoch": 0.6344133465304029, "grad_norm": 0.14467968046665192, "learning_rate": 0.001, "loss": 2.1409, "num_input_tokens_seen": 34859578368, "step": 66500 }, { "epoch": 0.6344133465304029, "eval_loss": 2.0547854900360107, "eval_runtime": 80.2151, "eval_samples_per_second": 62.332, "eval_steps_per_second": 15.583, "num_input_tokens_seen": 34859578368, "step": 66500 }, { "epoch": 0.6348903490465913, "grad_norm": 0.14870643615722656, "learning_rate": 0.001, "loss": 2.1386, "num_input_tokens_seen": 34885791584, "step": 66550 }, { "epoch": 0.6353673515627795, "grad_norm": 0.15154273808002472, "learning_rate": 0.001, "loss": 2.1329, "num_input_tokens_seen": 34911990016, "step": 66600 }, { "epoch": 0.6358443540789678, "grad_norm": 0.14637821912765503, "learning_rate": 0.001, "loss": 2.1307, "num_input_tokens_seen": 34938204416, "step": 66650 }, { "epoch": 0.636321356595156, "grad_norm": 0.15013527870178223, "learning_rate": 0.001, "loss": 2.1406, "num_input_tokens_seen": 34964415968, "step": 66700 }, { "epoch": 0.6367983591113443, "grad_norm": 0.14377915859222412, "learning_rate": 0.001, "loss": 2.1395, "num_input_tokens_seen": 34990625568, "step": 66750 }, { "epoch": 0.6372753616275326, "grad_norm": 0.14643235504627228, "learning_rate": 0.001, "loss": 2.1416, "num_input_tokens_seen": 35016834496, "step": 66800 }, { "epoch": 0.6377523641437208, "grad_norm": 0.1544150412082672, "learning_rate": 0.001, "loss": 2.1344, "num_input_tokens_seen": 35043044928, "step": 66850 }, { "epoch": 0.6382293666599091, "grad_norm": 0.15571437776088715, "learning_rate": 0.001, "loss": 2.1259, "num_input_tokens_seen": 35069259328, "step": 66900 }, { "epoch": 0.6387063691760974, "grad_norm": 0.15925458073616028, "learning_rate": 0.001, "loss": 2.1329, "num_input_tokens_seen": 35095468736, "step": 66950 }, { "epoch": 0.6391833716922857, "grad_norm": 0.14532826840877533, "learning_rate": 0.001, "loss": 2.1368, "num_input_tokens_seen": 35121682240, "step": 67000 }, { "epoch": 0.6391833716922857, "eval_loss": 2.055466651916504, "eval_runtime": 80.6411, "eval_samples_per_second": 62.003, "eval_steps_per_second": 15.501, "num_input_tokens_seen": 35121682240, "step": 67000 }, { "epoch": 0.639660374208474, "grad_norm": 0.13905447721481323, "learning_rate": 0.001, "loss": 2.133, "num_input_tokens_seen": 35147896096, "step": 67050 }, { "epoch": 0.6401373767246622, "grad_norm": 0.14530105888843536, "learning_rate": 0.001, "loss": 2.1422, "num_input_tokens_seen": 35174110496, "step": 67100 }, { "epoch": 0.6406143792408505, "grad_norm": 0.13817119598388672, "learning_rate": 0.001, "loss": 2.1362, "num_input_tokens_seen": 35200323040, "step": 67150 }, { "epoch": 0.6410913817570387, "grad_norm": 0.14851312339305878, "learning_rate": 0.001, "loss": 2.1424, "num_input_tokens_seen": 35226535808, "step": 67200 }, { "epoch": 0.641568384273227, "grad_norm": 0.13070625066757202, "learning_rate": 0.001, "loss": 2.1355, "num_input_tokens_seen": 35252750208, "step": 67250 }, { "epoch": 0.6420453867894154, "grad_norm": 0.1390138566493988, "learning_rate": 0.001, "loss": 2.1422, "num_input_tokens_seen": 35278951616, "step": 67300 }, { "epoch": 0.6425223893056036, "grad_norm": 0.1507682204246521, "learning_rate": 0.001, "loss": 2.1436, "num_input_tokens_seen": 35305166016, "step": 67350 }, { "epoch": 0.6429993918217919, "grad_norm": 0.14612345397472382, "learning_rate": 0.001, "loss": 2.1408, "num_input_tokens_seen": 35331376032, "step": 67400 }, { "epoch": 0.6434763943379801, "grad_norm": 0.1487749069929123, "learning_rate": 0.001, "loss": 2.1365, "num_input_tokens_seen": 35357590432, "step": 67450 }, { "epoch": 0.6439533968541684, "grad_norm": 0.14507658779621124, "learning_rate": 0.001, "loss": 2.1366, "num_input_tokens_seen": 35383796224, "step": 67500 }, { "epoch": 0.6439533968541684, "eval_loss": 2.054283380508423, "eval_runtime": 80.2964, "eval_samples_per_second": 62.269, "eval_steps_per_second": 15.567, "num_input_tokens_seen": 35383796224, "step": 67500 }, { "epoch": 0.6444303993703567, "grad_norm": 0.13577920198440552, "learning_rate": 0.001, "loss": 2.1375, "num_input_tokens_seen": 35410002592, "step": 67550 }, { "epoch": 0.6449074018865449, "grad_norm": 0.15499469637870789, "learning_rate": 0.001, "loss": 2.1412, "num_input_tokens_seen": 35436216736, "step": 67600 }, { "epoch": 0.6453844044027333, "grad_norm": 0.20250071585178375, "learning_rate": 0.001, "loss": 2.147, "num_input_tokens_seen": 35462413280, "step": 67650 }, { "epoch": 0.6458614069189215, "grad_norm": 0.1439153552055359, "learning_rate": 0.001, "loss": 2.1466, "num_input_tokens_seen": 35488624192, "step": 67700 }, { "epoch": 0.6463384094351098, "grad_norm": 0.153683140873909, "learning_rate": 0.001, "loss": 2.1469, "num_input_tokens_seen": 35514838592, "step": 67750 }, { "epoch": 0.646815411951298, "grad_norm": 0.12951679527759552, "learning_rate": 0.001, "loss": 2.1405, "num_input_tokens_seen": 35541050208, "step": 67800 }, { "epoch": 0.6472924144674863, "grad_norm": 0.14119641482830048, "learning_rate": 0.001, "loss": 2.1339, "num_input_tokens_seen": 35567256480, "step": 67850 }, { "epoch": 0.6477694169836746, "grad_norm": 0.15403713285923004, "learning_rate": 0.001, "loss": 2.1376, "num_input_tokens_seen": 35593470880, "step": 67900 }, { "epoch": 0.6482464194998628, "grad_norm": 0.14498727023601532, "learning_rate": 0.001, "loss": 2.1461, "num_input_tokens_seen": 35619682656, "step": 67950 }, { "epoch": 0.6487234220160512, "grad_norm": 0.13579486310482025, "learning_rate": 0.001, "loss": 2.137, "num_input_tokens_seen": 35645894016, "step": 68000 }, { "epoch": 0.6487234220160512, "eval_loss": 2.0545461177825928, "eval_runtime": 79.8852, "eval_samples_per_second": 62.59, "eval_steps_per_second": 15.647, "num_input_tokens_seen": 35645894016, "step": 68000 }, { "epoch": 0.6492004245322394, "grad_norm": 0.15173807740211487, "learning_rate": 0.001, "loss": 2.1376, "num_input_tokens_seen": 35672102176, "step": 68050 }, { "epoch": 0.6496774270484277, "grad_norm": 0.13916128873825073, "learning_rate": 0.001, "loss": 2.1341, "num_input_tokens_seen": 35698316576, "step": 68100 }, { "epoch": 0.650154429564616, "grad_norm": 0.14804038405418396, "learning_rate": 0.001, "loss": 2.1359, "num_input_tokens_seen": 35724521824, "step": 68150 }, { "epoch": 0.6506314320808042, "grad_norm": 0.17799383401870728, "learning_rate": 0.001, "loss": 2.1309, "num_input_tokens_seen": 35750733344, "step": 68200 }, { "epoch": 0.6511084345969925, "grad_norm": 0.14043092727661133, "learning_rate": 0.001, "loss": 2.146, "num_input_tokens_seen": 35776946304, "step": 68250 }, { "epoch": 0.6515854371131807, "grad_norm": 0.13235658407211304, "learning_rate": 0.001, "loss": 2.1451, "num_input_tokens_seen": 35803159264, "step": 68300 }, { "epoch": 0.652062439629369, "grad_norm": 0.1564619392156601, "learning_rate": 0.001, "loss": 2.1384, "num_input_tokens_seen": 35829368896, "step": 68350 }, { "epoch": 0.6525394421455574, "grad_norm": 0.13722547888755798, "learning_rate": 0.001, "loss": 2.1321, "num_input_tokens_seen": 35855575680, "step": 68400 }, { "epoch": 0.6530164446617456, "grad_norm": 0.16468219459056854, "learning_rate": 0.001, "loss": 2.1276, "num_input_tokens_seen": 35881785632, "step": 68450 }, { "epoch": 0.6534934471779339, "grad_norm": 0.14734308421611786, "learning_rate": 0.001, "loss": 2.1342, "num_input_tokens_seen": 35907993664, "step": 68500 }, { "epoch": 0.6534934471779339, "eval_loss": 2.052057981491089, "eval_runtime": 79.7834, "eval_samples_per_second": 62.67, "eval_steps_per_second": 15.667, "num_input_tokens_seen": 35907993664, "step": 68500 }, { "epoch": 0.6539704496941221, "grad_norm": 0.1417292058467865, "learning_rate": 0.001, "loss": 2.1392, "num_input_tokens_seen": 35934201600, "step": 68550 }, { "epoch": 0.6544474522103104, "grad_norm": 0.1365908682346344, "learning_rate": 0.001, "loss": 2.1464, "num_input_tokens_seen": 35960403488, "step": 68600 }, { "epoch": 0.6549244547264986, "grad_norm": 0.149469256401062, "learning_rate": 0.001, "loss": 2.1472, "num_input_tokens_seen": 35986617888, "step": 68650 }, { "epoch": 0.655401457242687, "grad_norm": 0.13806110620498657, "learning_rate": 0.001, "loss": 2.1437, "num_input_tokens_seen": 36012828320, "step": 68700 }, { "epoch": 0.6558784597588753, "grad_norm": 0.15856076776981354, "learning_rate": 0.001, "loss": 2.1329, "num_input_tokens_seen": 36039041440, "step": 68750 }, { "epoch": 0.6563554622750635, "grad_norm": 0.14309062063694, "learning_rate": 0.001, "loss": 2.139, "num_input_tokens_seen": 36065255840, "step": 68800 }, { "epoch": 0.6568324647912518, "grad_norm": 0.14342211186885834, "learning_rate": 0.001, "loss": 2.1481, "num_input_tokens_seen": 36091469600, "step": 68850 }, { "epoch": 0.65730946730744, "grad_norm": 0.1576087325811386, "learning_rate": 0.001, "loss": 2.1431, "num_input_tokens_seen": 36117684000, "step": 68900 }, { "epoch": 0.6577864698236283, "grad_norm": 0.13672710955142975, "learning_rate": 0.001, "loss": 2.1319, "num_input_tokens_seen": 36143896192, "step": 68950 }, { "epoch": 0.6582634723398166, "grad_norm": 0.1360878199338913, "learning_rate": 0.001, "loss": 2.1388, "num_input_tokens_seen": 36170105088, "step": 69000 }, { "epoch": 0.6582634723398166, "eval_loss": 2.050727128982544, "eval_runtime": 79.5858, "eval_samples_per_second": 62.825, "eval_steps_per_second": 15.706, "num_input_tokens_seen": 36170105088, "step": 69000 }, { "epoch": 0.6587404748560048, "grad_norm": 0.13592034578323364, "learning_rate": 0.001, "loss": 2.1327, "num_input_tokens_seen": 36196315744, "step": 69050 }, { "epoch": 0.6592174773721932, "grad_norm": 0.14260712265968323, "learning_rate": 0.001, "loss": 2.1372, "num_input_tokens_seen": 36222527072, "step": 69100 }, { "epoch": 0.6596944798883814, "grad_norm": 0.15452657639980316, "learning_rate": 0.001, "loss": 2.1472, "num_input_tokens_seen": 36248731872, "step": 69150 }, { "epoch": 0.6601714824045697, "grad_norm": 0.1424446702003479, "learning_rate": 0.001, "loss": 2.1364, "num_input_tokens_seen": 36274937600, "step": 69200 }, { "epoch": 0.660648484920758, "grad_norm": 0.13716866075992584, "learning_rate": 0.001, "loss": 2.1312, "num_input_tokens_seen": 36301149984, "step": 69250 }, { "epoch": 0.6611254874369462, "grad_norm": 0.14809830486774445, "learning_rate": 0.001, "loss": 2.1404, "num_input_tokens_seen": 36327350112, "step": 69300 }, { "epoch": 0.6616024899531345, "grad_norm": 0.1415916383266449, "learning_rate": 0.001, "loss": 2.135, "num_input_tokens_seen": 36353559776, "step": 69350 }, { "epoch": 0.6620794924693227, "grad_norm": 0.14032308757305145, "learning_rate": 0.001, "loss": 2.1428, "num_input_tokens_seen": 36379772736, "step": 69400 }, { "epoch": 0.6625564949855111, "grad_norm": 0.14201749861240387, "learning_rate": 0.001, "loss": 2.1325, "num_input_tokens_seen": 36405987136, "step": 69450 }, { "epoch": 0.6630334975016994, "grad_norm": 0.14647674560546875, "learning_rate": 0.001, "loss": 2.1339, "num_input_tokens_seen": 36432197248, "step": 69500 }, { "epoch": 0.6630334975016994, "eval_loss": 2.050248384475708, "eval_runtime": 79.7056, "eval_samples_per_second": 62.731, "eval_steps_per_second": 15.683, "num_input_tokens_seen": 36432197248, "step": 69500 }, { "epoch": 0.6635105000178876, "grad_norm": 0.1516619324684143, "learning_rate": 0.001, "loss": 2.1328, "num_input_tokens_seen": 36458411168, "step": 69550 }, { "epoch": 0.6639875025340759, "grad_norm": 0.1529875546693802, "learning_rate": 0.001, "loss": 2.1337, "num_input_tokens_seen": 36484615936, "step": 69600 }, { "epoch": 0.6644645050502641, "grad_norm": 0.14783112704753876, "learning_rate": 0.001, "loss": 2.1247, "num_input_tokens_seen": 36510827520, "step": 69650 }, { "epoch": 0.6649415075664524, "grad_norm": 0.1420241892337799, "learning_rate": 0.001, "loss": 2.1273, "num_input_tokens_seen": 36537041920, "step": 69700 }, { "epoch": 0.6654185100826406, "grad_norm": 0.15910027921199799, "learning_rate": 0.001, "loss": 2.1318, "num_input_tokens_seen": 36563256224, "step": 69750 }, { "epoch": 0.665895512598829, "grad_norm": 0.14894790947437286, "learning_rate": 0.001, "loss": 2.1453, "num_input_tokens_seen": 36589470624, "step": 69800 }, { "epoch": 0.6663725151150173, "grad_norm": 0.14341433346271515, "learning_rate": 0.001, "loss": 2.1341, "num_input_tokens_seen": 36615684320, "step": 69850 }, { "epoch": 0.6668495176312055, "grad_norm": 0.1415243148803711, "learning_rate": 0.001, "loss": 2.135, "num_input_tokens_seen": 36641887808, "step": 69900 }, { "epoch": 0.6673265201473938, "grad_norm": 0.1512666493654251, "learning_rate": 0.001, "loss": 2.128, "num_input_tokens_seen": 36668096416, "step": 69950 }, { "epoch": 0.667803522663582, "grad_norm": 0.1558062583208084, "learning_rate": 0.001, "loss": 2.118, "num_input_tokens_seen": 36694303392, "step": 70000 }, { "epoch": 0.667803522663582, "eval_loss": 2.050658941268921, "eval_runtime": 80.2312, "eval_samples_per_second": 62.32, "eval_steps_per_second": 15.58, "num_input_tokens_seen": 36694303392, "step": 70000 }, { "epoch": 0.6682805251797703, "grad_norm": 0.14296776056289673, "learning_rate": 0.001, "loss": 2.1323, "num_input_tokens_seen": 36720513888, "step": 70050 }, { "epoch": 0.6687575276959586, "grad_norm": 0.15423347055912018, "learning_rate": 0.001, "loss": 2.132, "num_input_tokens_seen": 36746724864, "step": 70100 }, { "epoch": 0.6692345302121468, "grad_norm": 0.155342698097229, "learning_rate": 0.001, "loss": 2.1362, "num_input_tokens_seen": 36772930208, "step": 70150 }, { "epoch": 0.6697115327283352, "grad_norm": 0.14429853856563568, "learning_rate": 0.001, "loss": 2.1307, "num_input_tokens_seen": 36799144608, "step": 70200 }, { "epoch": 0.6701885352445234, "grad_norm": 0.14069730043411255, "learning_rate": 0.001, "loss": 2.1386, "num_input_tokens_seen": 36825352256, "step": 70250 }, { "epoch": 0.6706655377607117, "grad_norm": 0.158811554312706, "learning_rate": 0.001, "loss": 2.1258, "num_input_tokens_seen": 36851566656, "step": 70300 }, { "epoch": 0.6711425402769, "grad_norm": 0.13650204241275787, "learning_rate": 0.001, "loss": 2.1176, "num_input_tokens_seen": 36877777984, "step": 70350 }, { "epoch": 0.6716195427930882, "grad_norm": 0.1499445140361786, "learning_rate": 0.001, "loss": 2.1329, "num_input_tokens_seen": 36903992384, "step": 70400 }, { "epoch": 0.6720965453092765, "grad_norm": 0.162213996052742, "learning_rate": 0.001, "loss": 2.1241, "num_input_tokens_seen": 36930204640, "step": 70450 }, { "epoch": 0.6725735478254647, "grad_norm": 0.13957861065864563, "learning_rate": 0.001, "loss": 2.134, "num_input_tokens_seen": 36956418880, "step": 70500 }, { "epoch": 0.6725735478254647, "eval_loss": 2.0487587451934814, "eval_runtime": 79.4988, "eval_samples_per_second": 62.894, "eval_steps_per_second": 15.724, "num_input_tokens_seen": 36956418880, "step": 70500 }, { "epoch": 0.6730505503416531, "grad_norm": 0.13786406815052032, "learning_rate": 0.001, "loss": 2.1295, "num_input_tokens_seen": 36982628800, "step": 70550 }, { "epoch": 0.6735275528578413, "grad_norm": 0.13988524675369263, "learning_rate": 0.001, "loss": 2.1399, "num_input_tokens_seen": 37008836032, "step": 70600 }, { "epoch": 0.6740045553740296, "grad_norm": 0.14156313240528107, "learning_rate": 0.001, "loss": 2.1389, "num_input_tokens_seen": 37035035264, "step": 70650 }, { "epoch": 0.6744815578902179, "grad_norm": 0.13705122470855713, "learning_rate": 0.001, "loss": 2.1327, "num_input_tokens_seen": 37061248224, "step": 70700 }, { "epoch": 0.6749585604064061, "grad_norm": 0.1541953831911087, "learning_rate": 0.001, "loss": 2.1386, "num_input_tokens_seen": 37087461408, "step": 70750 }, { "epoch": 0.6754355629225944, "grad_norm": 0.1509193331003189, "learning_rate": 0.001, "loss": 2.1368, "num_input_tokens_seen": 37113675808, "step": 70800 }, { "epoch": 0.6759125654387826, "grad_norm": 0.14552246034145355, "learning_rate": 0.001, "loss": 2.1467, "num_input_tokens_seen": 37139883072, "step": 70850 }, { "epoch": 0.676389567954971, "grad_norm": 0.1387251317501068, "learning_rate": 0.001, "loss": 2.1255, "num_input_tokens_seen": 37166094176, "step": 70900 }, { "epoch": 0.6768665704711593, "grad_norm": 0.15626934170722961, "learning_rate": 0.001, "loss": 2.1268, "num_input_tokens_seen": 37192306208, "step": 70950 }, { "epoch": 0.6773435729873475, "grad_norm": 0.14792028069496155, "learning_rate": 0.001, "loss": 2.1461, "num_input_tokens_seen": 37218518048, "step": 71000 }, { "epoch": 0.6773435729873475, "eval_loss": 2.049234390258789, "eval_runtime": 80.3085, "eval_samples_per_second": 62.26, "eval_steps_per_second": 15.565, "num_input_tokens_seen": 37218518048, "step": 71000 }, { "epoch": 0.6778205755035358, "grad_norm": 0.16609162092208862, "learning_rate": 0.001, "loss": 2.1336, "num_input_tokens_seen": 37244730912, "step": 71050 }, { "epoch": 0.678297578019724, "grad_norm": 0.15477871894836426, "learning_rate": 0.001, "loss": 2.1432, "num_input_tokens_seen": 37270939744, "step": 71100 }, { "epoch": 0.6787745805359123, "grad_norm": 0.1837802231311798, "learning_rate": 0.001, "loss": 2.1372, "num_input_tokens_seen": 37297151424, "step": 71150 }, { "epoch": 0.6792515830521006, "grad_norm": 0.14492639899253845, "learning_rate": 0.001, "loss": 2.1495, "num_input_tokens_seen": 37323357760, "step": 71200 }, { "epoch": 0.6797285855682889, "grad_norm": 0.14435459673404694, "learning_rate": 0.001, "loss": 2.1208, "num_input_tokens_seen": 37349568992, "step": 71250 }, { "epoch": 0.6802055880844772, "grad_norm": 0.1369018405675888, "learning_rate": 0.001, "loss": 2.1294, "num_input_tokens_seen": 37375769344, "step": 71300 }, { "epoch": 0.6806825906006654, "grad_norm": 0.15272092819213867, "learning_rate": 0.001, "loss": 2.1416, "num_input_tokens_seen": 37401976768, "step": 71350 }, { "epoch": 0.6811595931168537, "grad_norm": 0.13770927488803864, "learning_rate": 0.001, "loss": 2.1229, "num_input_tokens_seen": 37428186944, "step": 71400 }, { "epoch": 0.681636595633042, "grad_norm": 0.13732831180095673, "learning_rate": 0.001, "loss": 2.1322, "num_input_tokens_seen": 37454401344, "step": 71450 }, { "epoch": 0.6821135981492302, "grad_norm": 0.14253439009189606, "learning_rate": 0.001, "loss": 2.1361, "num_input_tokens_seen": 37480608672, "step": 71500 }, { "epoch": 0.6821135981492302, "eval_loss": 2.047884464263916, "eval_runtime": 80.2076, "eval_samples_per_second": 62.338, "eval_steps_per_second": 15.585, "num_input_tokens_seen": 37480608672, "step": 71500 }, { "epoch": 0.6825906006654185, "grad_norm": 0.14250271022319794, "learning_rate": 0.001, "loss": 2.1243, "num_input_tokens_seen": 37506820512, "step": 71550 }, { "epoch": 0.6830676031816068, "grad_norm": 0.14131279289722443, "learning_rate": 0.001, "loss": 2.1271, "num_input_tokens_seen": 37533034912, "step": 71600 }, { "epoch": 0.6835446056977951, "grad_norm": 0.1426624059677124, "learning_rate": 0.001, "loss": 2.1407, "num_input_tokens_seen": 37559241280, "step": 71650 }, { "epoch": 0.6840216082139833, "grad_norm": 0.15065455436706543, "learning_rate": 0.001, "loss": 2.1409, "num_input_tokens_seen": 37585440352, "step": 71700 }, { "epoch": 0.6844986107301716, "grad_norm": 0.15656264126300812, "learning_rate": 0.001, "loss": 2.1375, "num_input_tokens_seen": 37611650048, "step": 71750 }, { "epoch": 0.6849756132463599, "grad_norm": 0.15184299647808075, "learning_rate": 0.001, "loss": 2.1341, "num_input_tokens_seen": 37637858912, "step": 71800 }, { "epoch": 0.6854526157625481, "grad_norm": 0.14735595881938934, "learning_rate": 0.001, "loss": 2.1262, "num_input_tokens_seen": 37664073312, "step": 71850 }, { "epoch": 0.6859296182787364, "grad_norm": 0.13618548214435577, "learning_rate": 0.001, "loss": 2.1271, "num_input_tokens_seen": 37690272480, "step": 71900 }, { "epoch": 0.6864066207949246, "grad_norm": 0.15221554040908813, "learning_rate": 0.001, "loss": 2.1182, "num_input_tokens_seen": 37716486080, "step": 71950 }, { "epoch": 0.686883623311113, "grad_norm": 0.14794082939624786, "learning_rate": 0.001, "loss": 2.1369, "num_input_tokens_seen": 37742700480, "step": 72000 }, { "epoch": 0.686883623311113, "eval_loss": 2.0480294227600098, "eval_runtime": 80.2069, "eval_samples_per_second": 62.339, "eval_steps_per_second": 15.585, "num_input_tokens_seen": 37742700480, "step": 72000 }, { "epoch": 0.6873606258273013, "grad_norm": 0.1561603844165802, "learning_rate": 0.001, "loss": 2.1356, "num_input_tokens_seen": 37768907520, "step": 72050 }, { "epoch": 0.6878376283434895, "grad_norm": 0.1416538655757904, "learning_rate": 0.001, "loss": 2.1286, "num_input_tokens_seen": 37795120992, "step": 72100 }, { "epoch": 0.6883146308596778, "grad_norm": 0.13913485407829285, "learning_rate": 0.001, "loss": 2.1363, "num_input_tokens_seen": 37821335072, "step": 72150 }, { "epoch": 0.688791633375866, "grad_norm": 0.14764831960201263, "learning_rate": 0.001, "loss": 2.1218, "num_input_tokens_seen": 37847540704, "step": 72200 }, { "epoch": 0.6892686358920543, "grad_norm": 0.1435699462890625, "learning_rate": 0.001, "loss": 2.1382, "num_input_tokens_seen": 37873742272, "step": 72250 }, { "epoch": 0.6897456384082427, "grad_norm": 0.13604077696800232, "learning_rate": 0.001, "loss": 2.1406, "num_input_tokens_seen": 37899949728, "step": 72300 }, { "epoch": 0.6902226409244309, "grad_norm": 0.1389516144990921, "learning_rate": 0.001, "loss": 2.1345, "num_input_tokens_seen": 37926155104, "step": 72350 }, { "epoch": 0.6906996434406192, "grad_norm": 0.15023711323738098, "learning_rate": 0.001, "loss": 2.122, "num_input_tokens_seen": 37952368992, "step": 72400 }, { "epoch": 0.6911766459568074, "grad_norm": 0.1581972986459732, "learning_rate": 0.001, "loss": 2.1381, "num_input_tokens_seen": 37978578048, "step": 72450 }, { "epoch": 0.6916536484729957, "grad_norm": 0.1558607965707779, "learning_rate": 0.001, "loss": 2.1369, "num_input_tokens_seen": 38004789088, "step": 72500 }, { "epoch": 0.6916536484729957, "eval_loss": 2.0476512908935547, "eval_runtime": 80.2302, "eval_samples_per_second": 62.321, "eval_steps_per_second": 15.58, "num_input_tokens_seen": 38004789088, "step": 72500 }, { "epoch": 0.6921306509891839, "grad_norm": 0.15057148039340973, "learning_rate": 0.001, "loss": 2.1315, "num_input_tokens_seen": 38030998464, "step": 72550 }, { "epoch": 0.6926076535053722, "grad_norm": 0.1446782648563385, "learning_rate": 0.001, "loss": 2.1385, "num_input_tokens_seen": 38057210368, "step": 72600 }, { "epoch": 0.6930846560215606, "grad_norm": 0.15265704691410065, "learning_rate": 0.001, "loss": 2.1313, "num_input_tokens_seen": 38083424608, "step": 72650 }, { "epoch": 0.6935616585377488, "grad_norm": 0.15340076386928558, "learning_rate": 0.001, "loss": 2.1335, "num_input_tokens_seen": 38109639008, "step": 72700 }, { "epoch": 0.6940386610539371, "grad_norm": 0.15052905678749084, "learning_rate": 0.001, "loss": 2.1291, "num_input_tokens_seen": 38135849184, "step": 72750 }, { "epoch": 0.6945156635701253, "grad_norm": 0.1488681584596634, "learning_rate": 0.001, "loss": 2.1269, "num_input_tokens_seen": 38162055520, "step": 72800 }, { "epoch": 0.6949926660863136, "grad_norm": 0.14417092502117157, "learning_rate": 0.001, "loss": 2.1291, "num_input_tokens_seen": 38188267136, "step": 72850 }, { "epoch": 0.6954696686025019, "grad_norm": 0.14182248711585999, "learning_rate": 0.001, "loss": 2.1369, "num_input_tokens_seen": 38214476512, "step": 72900 }, { "epoch": 0.6959466711186901, "grad_norm": 0.13842789828777313, "learning_rate": 0.001, "loss": 2.1487, "num_input_tokens_seen": 38240668640, "step": 72950 }, { "epoch": 0.6964236736348784, "grad_norm": 0.13514494895935059, "learning_rate": 0.001, "loss": 2.1379, "num_input_tokens_seen": 38266876896, "step": 73000 }, { "epoch": 0.6964236736348784, "eval_loss": 2.0458996295928955, "eval_runtime": 80.4905, "eval_samples_per_second": 62.119, "eval_steps_per_second": 15.53, "num_input_tokens_seen": 38266876896, "step": 73000 }, { "epoch": 0.6969006761510667, "grad_norm": 0.14907589554786682, "learning_rate": 0.001, "loss": 2.118, "num_input_tokens_seen": 38293080608, "step": 73050 }, { "epoch": 0.697377678667255, "grad_norm": 0.1431187242269516, "learning_rate": 0.001, "loss": 2.1337, "num_input_tokens_seen": 38319293216, "step": 73100 }, { "epoch": 0.6978546811834433, "grad_norm": 0.15081514418125153, "learning_rate": 0.001, "loss": 2.1272, "num_input_tokens_seen": 38345504128, "step": 73150 }, { "epoch": 0.6983316836996315, "grad_norm": 0.16645316779613495, "learning_rate": 0.001, "loss": 2.1272, "num_input_tokens_seen": 38371716608, "step": 73200 }, { "epoch": 0.6988086862158198, "grad_norm": 0.13646705448627472, "learning_rate": 0.001, "loss": 2.1267, "num_input_tokens_seen": 38397931008, "step": 73250 }, { "epoch": 0.699285688732008, "grad_norm": 0.147465780377388, "learning_rate": 0.001, "loss": 2.1372, "num_input_tokens_seen": 38424145312, "step": 73300 }, { "epoch": 0.6997626912481963, "grad_norm": 0.15060865879058838, "learning_rate": 0.001, "loss": 2.1253, "num_input_tokens_seen": 38450358240, "step": 73350 }, { "epoch": 0.7002396937643846, "grad_norm": 0.14528082311153412, "learning_rate": 0.001, "loss": 2.1285, "num_input_tokens_seen": 38476567968, "step": 73400 }, { "epoch": 0.7007166962805729, "grad_norm": 0.15923307836055756, "learning_rate": 0.001, "loss": 2.1345, "num_input_tokens_seen": 38502782336, "step": 73450 }, { "epoch": 0.7011936987967612, "grad_norm": 0.1410328447818756, "learning_rate": 0.001, "loss": 2.1235, "num_input_tokens_seen": 38528990144, "step": 73500 }, { "epoch": 0.7011936987967612, "eval_loss": 2.045837163925171, "eval_runtime": 79.7863, "eval_samples_per_second": 62.667, "eval_steps_per_second": 15.667, "num_input_tokens_seen": 38528990144, "step": 73500 }, { "epoch": 0.7016707013129494, "grad_norm": 0.13489565253257751, "learning_rate": 0.001, "loss": 2.1331, "num_input_tokens_seen": 38555187872, "step": 73550 }, { "epoch": 0.7021477038291377, "grad_norm": 0.15437881648540497, "learning_rate": 0.001, "loss": 2.1341, "num_input_tokens_seen": 38581402272, "step": 73600 }, { "epoch": 0.7026247063453259, "grad_norm": 0.1449405401945114, "learning_rate": 0.001, "loss": 2.1244, "num_input_tokens_seen": 38607616672, "step": 73650 }, { "epoch": 0.7031017088615142, "grad_norm": 0.1428922414779663, "learning_rate": 0.001, "loss": 2.1309, "num_input_tokens_seen": 38633831072, "step": 73700 }, { "epoch": 0.7035787113777026, "grad_norm": 0.14642202854156494, "learning_rate": 0.001, "loss": 2.1343, "num_input_tokens_seen": 38660044448, "step": 73750 }, { "epoch": 0.7040557138938908, "grad_norm": 0.138772115111351, "learning_rate": 0.001, "loss": 2.1299, "num_input_tokens_seen": 38686253024, "step": 73800 }, { "epoch": 0.7045327164100791, "grad_norm": 0.14213278889656067, "learning_rate": 0.001, "loss": 2.1238, "num_input_tokens_seen": 38712461152, "step": 73850 }, { "epoch": 0.7050097189262673, "grad_norm": 0.13823473453521729, "learning_rate": 0.001, "loss": 2.1197, "num_input_tokens_seen": 38738675552, "step": 73900 }, { "epoch": 0.7054867214424556, "grad_norm": 0.14536434412002563, "learning_rate": 0.001, "loss": 2.1276, "num_input_tokens_seen": 38764889952, "step": 73950 }, { "epoch": 0.7059637239586439, "grad_norm": 0.1466161459684372, "learning_rate": 0.001, "loss": 2.127, "num_input_tokens_seen": 38791103008, "step": 74000 }, { "epoch": 0.7059637239586439, "eval_loss": 2.045396566390991, "eval_runtime": 79.9741, "eval_samples_per_second": 62.52, "eval_steps_per_second": 15.63, "num_input_tokens_seen": 38791103008, "step": 74000 }, { "epoch": 0.7064407264748321, "grad_norm": 0.13936443626880646, "learning_rate": 0.001, "loss": 2.1354, "num_input_tokens_seen": 38817315392, "step": 74050 }, { "epoch": 0.7069177289910205, "grad_norm": 0.14214852452278137, "learning_rate": 0.001, "loss": 2.1254, "num_input_tokens_seen": 38843526016, "step": 74100 }, { "epoch": 0.7073947315072087, "grad_norm": 0.133310005068779, "learning_rate": 0.001, "loss": 2.1199, "num_input_tokens_seen": 38869733760, "step": 74150 }, { "epoch": 0.707871734023397, "grad_norm": 0.1473781317472458, "learning_rate": 0.001, "loss": 2.1187, "num_input_tokens_seen": 38895948160, "step": 74200 }, { "epoch": 0.7083487365395853, "grad_norm": 0.15462498366832733, "learning_rate": 0.001, "loss": 2.1318, "num_input_tokens_seen": 38922156384, "step": 74250 }, { "epoch": 0.7088257390557735, "grad_norm": 0.13819323480129242, "learning_rate": 0.001, "loss": 2.1345, "num_input_tokens_seen": 38948364896, "step": 74300 }, { "epoch": 0.7093027415719618, "grad_norm": 0.14366789162158966, "learning_rate": 0.001, "loss": 2.1295, "num_input_tokens_seen": 38974568576, "step": 74350 }, { "epoch": 0.70977974408815, "grad_norm": 0.1546156257390976, "learning_rate": 0.001, "loss": 2.1398, "num_input_tokens_seen": 39000771968, "step": 74400 }, { "epoch": 0.7102567466043384, "grad_norm": 0.14302626252174377, "learning_rate": 0.001, "loss": 2.1276, "num_input_tokens_seen": 39026975968, "step": 74450 }, { "epoch": 0.7107337491205266, "grad_norm": 0.14276665449142456, "learning_rate": 0.001, "loss": 2.1362, "num_input_tokens_seen": 39053184384, "step": 74500 }, { "epoch": 0.7107337491205266, "eval_loss": 2.0453200340270996, "eval_runtime": 80.1033, "eval_samples_per_second": 62.419, "eval_steps_per_second": 15.605, "num_input_tokens_seen": 39053184384, "step": 74500 }, { "epoch": 0.7112107516367149, "grad_norm": 0.15070320665836334, "learning_rate": 0.001, "loss": 2.1197, "num_input_tokens_seen": 39079385760, "step": 74550 }, { "epoch": 0.7116877541529032, "grad_norm": 0.14792390167713165, "learning_rate": 0.001, "loss": 2.1332, "num_input_tokens_seen": 39105597568, "step": 74600 }, { "epoch": 0.7121647566690914, "grad_norm": 0.15546678006649017, "learning_rate": 0.001, "loss": 2.1315, "num_input_tokens_seen": 39131798528, "step": 74650 }, { "epoch": 0.7126417591852797, "grad_norm": 0.15446694195270538, "learning_rate": 0.001, "loss": 2.1346, "num_input_tokens_seen": 39158006240, "step": 74700 }, { "epoch": 0.7131187617014679, "grad_norm": 0.14489658176898956, "learning_rate": 0.001, "loss": 2.1217, "num_input_tokens_seen": 39184214656, "step": 74750 }, { "epoch": 0.7135957642176562, "grad_norm": 0.14391835033893585, "learning_rate": 0.001, "loss": 2.1318, "num_input_tokens_seen": 39210425120, "step": 74800 }, { "epoch": 0.7140727667338446, "grad_norm": 0.1562168151140213, "learning_rate": 0.001, "loss": 2.1367, "num_input_tokens_seen": 39236633824, "step": 74850 }, { "epoch": 0.7145497692500328, "grad_norm": 0.14505062997341156, "learning_rate": 0.001, "loss": 2.126, "num_input_tokens_seen": 39262845056, "step": 74900 }, { "epoch": 0.7150267717662211, "grad_norm": 0.17240794003009796, "learning_rate": 0.001, "loss": 2.1316, "num_input_tokens_seen": 39289059456, "step": 74950 }, { "epoch": 0.7155037742824093, "grad_norm": 0.14480435848236084, "learning_rate": 0.001, "loss": 2.1096, "num_input_tokens_seen": 39315259072, "step": 75000 }, { "epoch": 0.7155037742824093, "eval_loss": 2.0447704792022705, "eval_runtime": 80.0622, "eval_samples_per_second": 62.451, "eval_steps_per_second": 15.613, "num_input_tokens_seen": 39315259072, "step": 75000 }, { "epoch": 0.7159807767985976, "grad_norm": 0.15248794853687286, "learning_rate": 0.001, "loss": 2.1311, "num_input_tokens_seen": 39341461440, "step": 75050 }, { "epoch": 0.7164577793147859, "grad_norm": 0.13991257548332214, "learning_rate": 0.001, "loss": 2.142, "num_input_tokens_seen": 39367670304, "step": 75100 }, { "epoch": 0.7169347818309741, "grad_norm": 0.14684896171092987, "learning_rate": 0.001, "loss": 2.131, "num_input_tokens_seen": 39393880928, "step": 75150 }, { "epoch": 0.7174117843471625, "grad_norm": 0.14778843522071838, "learning_rate": 0.001, "loss": 2.1185, "num_input_tokens_seen": 39420094080, "step": 75200 }, { "epoch": 0.7178887868633507, "grad_norm": 0.14234404265880585, "learning_rate": 0.001, "loss": 2.1172, "num_input_tokens_seen": 39446305376, "step": 75250 }, { "epoch": 0.718365789379539, "grad_norm": 0.1400527060031891, "learning_rate": 0.001, "loss": 2.1333, "num_input_tokens_seen": 39472510304, "step": 75300 }, { "epoch": 0.7188427918957272, "grad_norm": 0.14747172594070435, "learning_rate": 0.001, "loss": 2.1369, "num_input_tokens_seen": 39498722784, "step": 75350 }, { "epoch": 0.7193197944119155, "grad_norm": 0.14818298816680908, "learning_rate": 0.001, "loss": 2.1375, "num_input_tokens_seen": 39524932416, "step": 75400 }, { "epoch": 0.7197967969281038, "grad_norm": 0.14356687664985657, "learning_rate": 0.001, "loss": 2.1334, "num_input_tokens_seen": 39551136768, "step": 75450 }, { "epoch": 0.720273799444292, "grad_norm": 0.12981760501861572, "learning_rate": 0.001, "loss": 2.1273, "num_input_tokens_seen": 39577349376, "step": 75500 }, { "epoch": 0.720273799444292, "eval_loss": 2.0432538986206055, "eval_runtime": 80.0711, "eval_samples_per_second": 62.445, "eval_steps_per_second": 15.611, "num_input_tokens_seen": 39577349376, "step": 75500 }, { "epoch": 0.7207508019604804, "grad_norm": 0.14318658411502838, "learning_rate": 0.001, "loss": 2.1299, "num_input_tokens_seen": 39603560448, "step": 75550 }, { "epoch": 0.7212278044766686, "grad_norm": 0.1411541849374771, "learning_rate": 0.001, "loss": 2.1298, "num_input_tokens_seen": 39629764544, "step": 75600 }, { "epoch": 0.7217048069928569, "grad_norm": 0.15290595591068268, "learning_rate": 0.001, "loss": 2.1194, "num_input_tokens_seen": 39655978944, "step": 75650 }, { "epoch": 0.7221818095090452, "grad_norm": 0.15146586298942566, "learning_rate": 0.001, "loss": 2.1226, "num_input_tokens_seen": 39682192608, "step": 75700 }, { "epoch": 0.7226588120252334, "grad_norm": 0.1520843505859375, "learning_rate": 0.001, "loss": 2.1375, "num_input_tokens_seen": 39708395360, "step": 75750 }, { "epoch": 0.7231358145414217, "grad_norm": 0.14785976707935333, "learning_rate": 0.001, "loss": 2.1279, "num_input_tokens_seen": 39734607232, "step": 75800 }, { "epoch": 0.7236128170576099, "grad_norm": 0.1379549652338028, "learning_rate": 0.001, "loss": 2.1234, "num_input_tokens_seen": 39760818368, "step": 75850 }, { "epoch": 0.7240898195737983, "grad_norm": 0.15331624448299408, "learning_rate": 0.001, "loss": 2.1288, "num_input_tokens_seen": 39787032768, "step": 75900 }, { "epoch": 0.7245668220899866, "grad_norm": 0.14259029924869537, "learning_rate": 0.001, "loss": 2.123, "num_input_tokens_seen": 39813244832, "step": 75950 }, { "epoch": 0.7250438246061748, "grad_norm": 0.14519137144088745, "learning_rate": 0.001, "loss": 2.1325, "num_input_tokens_seen": 39839459232, "step": 76000 }, { "epoch": 0.7250438246061748, "eval_loss": 2.043282985687256, "eval_runtime": 80.639, "eval_samples_per_second": 62.005, "eval_steps_per_second": 15.501, "num_input_tokens_seen": 39839459232, "step": 76000 }, { "epoch": 0.7255208271223631, "grad_norm": 0.14390450716018677, "learning_rate": 0.001, "loss": 2.1264, "num_input_tokens_seen": 39865673408, "step": 76050 }, { "epoch": 0.7259978296385513, "grad_norm": 0.14975307881832123, "learning_rate": 0.001, "loss": 2.1256, "num_input_tokens_seen": 39891887808, "step": 76100 }, { "epoch": 0.7264748321547396, "grad_norm": 0.1487993597984314, "learning_rate": 0.001, "loss": 2.1181, "num_input_tokens_seen": 39918100800, "step": 76150 }, { "epoch": 0.7269518346709279, "grad_norm": 0.13411827385425568, "learning_rate": 0.001, "loss": 2.1253, "num_input_tokens_seen": 39944315200, "step": 76200 }, { "epoch": 0.7274288371871162, "grad_norm": 0.14648018777370453, "learning_rate": 0.001, "loss": 2.1212, "num_input_tokens_seen": 39970507840, "step": 76250 }, { "epoch": 0.7279058397033045, "grad_norm": 0.14438115060329437, "learning_rate": 0.001, "loss": 2.124, "num_input_tokens_seen": 39996717120, "step": 76300 }, { "epoch": 0.7283828422194927, "grad_norm": 0.1373198926448822, "learning_rate": 0.001, "loss": 2.1333, "num_input_tokens_seen": 40022924960, "step": 76350 }, { "epoch": 0.728859844735681, "grad_norm": 0.1409999579191208, "learning_rate": 0.001, "loss": 2.1269, "num_input_tokens_seen": 40049132160, "step": 76400 }, { "epoch": 0.7293368472518692, "grad_norm": 0.15943694114685059, "learning_rate": 0.001, "loss": 2.1236, "num_input_tokens_seen": 40075334464, "step": 76450 }, { "epoch": 0.7298138497680575, "grad_norm": 0.14787088334560394, "learning_rate": 0.001, "loss": 2.1261, "num_input_tokens_seen": 40101539296, "step": 76500 }, { "epoch": 0.7298138497680575, "eval_loss": 2.042207717895508, "eval_runtime": 80.2748, "eval_samples_per_second": 62.286, "eval_steps_per_second": 15.572, "num_input_tokens_seen": 40101539296, "step": 76500 }, { "epoch": 0.7302908522842458, "grad_norm": 0.15151502192020416, "learning_rate": 0.001, "loss": 2.1314, "num_input_tokens_seen": 40127751296, "step": 76550 }, { "epoch": 0.730767854800434, "grad_norm": 0.13959145545959473, "learning_rate": 0.001, "loss": 2.1249, "num_input_tokens_seen": 40153964320, "step": 76600 }, { "epoch": 0.7312448573166224, "grad_norm": 0.13703158497810364, "learning_rate": 0.001, "loss": 2.1347, "num_input_tokens_seen": 40180178176, "step": 76650 }, { "epoch": 0.7317218598328106, "grad_norm": 0.1381351500749588, "learning_rate": 0.001, "loss": 2.1268, "num_input_tokens_seen": 40206390688, "step": 76700 }, { "epoch": 0.7321988623489989, "grad_norm": 0.14056669175624847, "learning_rate": 0.001, "loss": 2.1174, "num_input_tokens_seen": 40232601472, "step": 76750 }, { "epoch": 0.7326758648651872, "grad_norm": 0.1344117820262909, "learning_rate": 0.001, "loss": 2.1319, "num_input_tokens_seen": 40258813184, "step": 76800 }, { "epoch": 0.7331528673813754, "grad_norm": 0.14882655441761017, "learning_rate": 0.001, "loss": 2.1361, "num_input_tokens_seen": 40285020800, "step": 76850 }, { "epoch": 0.7336298698975637, "grad_norm": 0.1313314437866211, "learning_rate": 0.001, "loss": 2.1244, "num_input_tokens_seen": 40311227808, "step": 76900 }, { "epoch": 0.7341068724137519, "grad_norm": 0.13959497213363647, "learning_rate": 0.001, "loss": 2.1233, "num_input_tokens_seen": 40337436928, "step": 76950 }, { "epoch": 0.7345838749299403, "grad_norm": 0.16767309606075287, "learning_rate": 0.001, "loss": 2.1284, "num_input_tokens_seen": 40363651328, "step": 77000 }, { "epoch": 0.7345838749299403, "eval_loss": 2.041841983795166, "eval_runtime": 80.626, "eval_samples_per_second": 62.015, "eval_steps_per_second": 15.504, "num_input_tokens_seen": 40363651328, "step": 77000 }, { "epoch": 0.7350608774461286, "grad_norm": 0.14556396007537842, "learning_rate": 0.001, "loss": 2.1383, "num_input_tokens_seen": 40389861376, "step": 77050 }, { "epoch": 0.7355378799623168, "grad_norm": 0.16213080286979675, "learning_rate": 0.001, "loss": 2.1288, "num_input_tokens_seen": 40416069344, "step": 77100 }, { "epoch": 0.7360148824785051, "grad_norm": 0.15535910427570343, "learning_rate": 0.001, "loss": 2.1311, "num_input_tokens_seen": 40442276800, "step": 77150 }, { "epoch": 0.7364918849946933, "grad_norm": 0.14690810441970825, "learning_rate": 0.001, "loss": 2.1303, "num_input_tokens_seen": 40468487520, "step": 77200 }, { "epoch": 0.7369688875108816, "grad_norm": 0.1359778791666031, "learning_rate": 0.001, "loss": 2.1293, "num_input_tokens_seen": 40494701920, "step": 77250 }, { "epoch": 0.7374458900270698, "grad_norm": 0.1551726907491684, "learning_rate": 0.001, "loss": 2.1319, "num_input_tokens_seen": 40520907008, "step": 77300 }, { "epoch": 0.7379228925432582, "grad_norm": 0.1439419388771057, "learning_rate": 0.001, "loss": 2.1217, "num_input_tokens_seen": 40547111296, "step": 77350 }, { "epoch": 0.7383998950594465, "grad_norm": 0.15661780536174774, "learning_rate": 0.001, "loss": 2.1218, "num_input_tokens_seen": 40573317376, "step": 77400 }, { "epoch": 0.7388768975756347, "grad_norm": 0.15002021193504333, "learning_rate": 0.001, "loss": 2.127, "num_input_tokens_seen": 40599530048, "step": 77450 }, { "epoch": 0.739353900091823, "grad_norm": 0.148692324757576, "learning_rate": 0.001, "loss": 2.1152, "num_input_tokens_seen": 40625741536, "step": 77500 }, { "epoch": 0.739353900091823, "eval_loss": 2.0417163372039795, "eval_runtime": 80.5896, "eval_samples_per_second": 62.043, "eval_steps_per_second": 15.511, "num_input_tokens_seen": 40625741536, "step": 77500 }, { "epoch": 0.7398309026080112, "grad_norm": 0.15267367660999298, "learning_rate": 0.001, "loss": 2.1358, "num_input_tokens_seen": 40651948224, "step": 77550 }, { "epoch": 0.7403079051241995, "grad_norm": 0.14535802602767944, "learning_rate": 0.001, "loss": 2.1194, "num_input_tokens_seen": 40678158304, "step": 77600 }, { "epoch": 0.7407849076403878, "grad_norm": 0.14069873094558716, "learning_rate": 0.001, "loss": 2.1254, "num_input_tokens_seen": 40704364160, "step": 77650 }, { "epoch": 0.741261910156576, "grad_norm": 0.15348641574382782, "learning_rate": 0.001, "loss": 2.1227, "num_input_tokens_seen": 40730574080, "step": 77700 }, { "epoch": 0.7417389126727644, "grad_norm": 0.13605211675167084, "learning_rate": 0.001, "loss": 2.1319, "num_input_tokens_seen": 40756787808, "step": 77750 }, { "epoch": 0.7422159151889526, "grad_norm": 0.14588411152362823, "learning_rate": 0.001, "loss": 2.17, "num_input_tokens_seen": 40782995104, "step": 77800 }, { "epoch": 0.7426929177051409, "grad_norm": 0.14045777916908264, "learning_rate": 0.001, "loss": 2.1396, "num_input_tokens_seen": 40809206464, "step": 77850 }, { "epoch": 0.7431699202213292, "grad_norm": 0.1325819045305252, "learning_rate": 0.001, "loss": 2.1301, "num_input_tokens_seen": 40835418400, "step": 77900 }, { "epoch": 0.7436469227375174, "grad_norm": 0.14319738745689392, "learning_rate": 0.001, "loss": 2.1249, "num_input_tokens_seen": 40861629600, "step": 77950 }, { "epoch": 0.7441239252537057, "grad_norm": 0.12736602127552032, "learning_rate": 0.001, "loss": 2.1353, "num_input_tokens_seen": 40887844000, "step": 78000 }, { "epoch": 0.7441239252537057, "eval_loss": 2.0413477420806885, "eval_runtime": 80.6499, "eval_samples_per_second": 61.996, "eval_steps_per_second": 15.499, "num_input_tokens_seen": 40887844000, "step": 78000 }, { "epoch": 0.744600927769894, "grad_norm": 0.14694809913635254, "learning_rate": 0.001, "loss": 2.1279, "num_input_tokens_seen": 40914051584, "step": 78050 }, { "epoch": 0.7450779302860823, "grad_norm": 0.13846631348133087, "learning_rate": 0.001, "loss": 2.1247, "num_input_tokens_seen": 40940260192, "step": 78100 }, { "epoch": 0.7455549328022706, "grad_norm": 0.1346752941608429, "learning_rate": 0.001, "loss": 2.1205, "num_input_tokens_seen": 40966469856, "step": 78150 }, { "epoch": 0.7460319353184588, "grad_norm": 0.14688965678215027, "learning_rate": 0.001, "loss": 2.1268, "num_input_tokens_seen": 40992684256, "step": 78200 }, { "epoch": 0.7465089378346471, "grad_norm": 0.13972339034080505, "learning_rate": 0.001, "loss": 2.117, "num_input_tokens_seen": 41018896352, "step": 78250 }, { "epoch": 0.7469859403508353, "grad_norm": 0.13046054542064667, "learning_rate": 0.001, "loss": 2.1325, "num_input_tokens_seen": 41045100768, "step": 78300 }, { "epoch": 0.7474629428670236, "grad_norm": 0.14544983208179474, "learning_rate": 0.001, "loss": 2.1199, "num_input_tokens_seen": 41071312032, "step": 78350 }, { "epoch": 0.7479399453832118, "grad_norm": 0.13829651474952698, "learning_rate": 0.001, "loss": 2.1233, "num_input_tokens_seen": 41097519104, "step": 78400 }, { "epoch": 0.7484169478994002, "grad_norm": 0.13015754520893097, "learning_rate": 0.001, "loss": 2.1273, "num_input_tokens_seen": 41123726848, "step": 78450 }, { "epoch": 0.7488939504155885, "grad_norm": 0.15603971481323242, "learning_rate": 0.001, "loss": 2.1214, "num_input_tokens_seen": 41149941248, "step": 78500 }, { "epoch": 0.7488939504155885, "eval_loss": 2.0389957427978516, "eval_runtime": 80.3381, "eval_samples_per_second": 62.237, "eval_steps_per_second": 15.559, "num_input_tokens_seen": 41149941248, "step": 78500 }, { "epoch": 0.7493709529317767, "grad_norm": 0.142806738615036, "learning_rate": 0.001, "loss": 2.1262, "num_input_tokens_seen": 41176154656, "step": 78550 }, { "epoch": 0.749847955447965, "grad_norm": 0.12840932607650757, "learning_rate": 0.001, "loss": 2.1264, "num_input_tokens_seen": 41202368992, "step": 78600 }, { "epoch": 0.7503249579641532, "grad_norm": 0.14436882734298706, "learning_rate": 0.001, "loss": 2.1164, "num_input_tokens_seen": 41228583392, "step": 78650 }, { "epoch": 0.7508019604803415, "grad_norm": 0.14413060247898102, "learning_rate": 0.001, "loss": 2.1365, "num_input_tokens_seen": 41254796832, "step": 78700 }, { "epoch": 0.7512789629965299, "grad_norm": 0.14348316192626953, "learning_rate": 0.001, "loss": 2.1297, "num_input_tokens_seen": 41281008000, "step": 78750 }, { "epoch": 0.7517559655127181, "grad_norm": 0.1614920049905777, "learning_rate": 0.001, "loss": 2.1274, "num_input_tokens_seen": 41307217120, "step": 78800 }, { "epoch": 0.7522329680289064, "grad_norm": 0.13642476499080658, "learning_rate": 0.001, "loss": 2.1198, "num_input_tokens_seen": 41333429376, "step": 78850 }, { "epoch": 0.7527099705450946, "grad_norm": 0.13858525454998016, "learning_rate": 0.001, "loss": 2.1142, "num_input_tokens_seen": 41359629152, "step": 78900 }, { "epoch": 0.7531869730612829, "grad_norm": 0.1430158019065857, "learning_rate": 0.001, "loss": 2.1247, "num_input_tokens_seen": 41385843552, "step": 78950 }, { "epoch": 0.7536639755774712, "grad_norm": 0.14330270886421204, "learning_rate": 0.001, "loss": 2.1184, "num_input_tokens_seen": 41412054816, "step": 79000 }, { "epoch": 0.7536639755774712, "eval_loss": 2.039459705352783, "eval_runtime": 79.6308, "eval_samples_per_second": 62.79, "eval_steps_per_second": 15.697, "num_input_tokens_seen": 41412054816, "step": 79000 }, { "epoch": 0.7541409780936594, "grad_norm": 0.15824219584465027, "learning_rate": 0.001, "loss": 2.1102, "num_input_tokens_seen": 41438265536, "step": 79050 }, { "epoch": 0.7546179806098477, "grad_norm": 0.14686284959316254, "learning_rate": 0.001, "loss": 2.1192, "num_input_tokens_seen": 41464479360, "step": 79100 }, { "epoch": 0.755094983126036, "grad_norm": 0.15335090458393097, "learning_rate": 0.001, "loss": 2.127, "num_input_tokens_seen": 41490678464, "step": 79150 }, { "epoch": 0.7555719856422243, "grad_norm": 0.15127016603946686, "learning_rate": 0.001, "loss": 2.1114, "num_input_tokens_seen": 41516875424, "step": 79200 }, { "epoch": 0.7560489881584125, "grad_norm": 0.14542415738105774, "learning_rate": 0.001, "loss": 2.1244, "num_input_tokens_seen": 41543082560, "step": 79250 }, { "epoch": 0.7565259906746008, "grad_norm": 0.1413310468196869, "learning_rate": 0.001, "loss": 2.1196, "num_input_tokens_seen": 41569295104, "step": 79300 }, { "epoch": 0.7570029931907891, "grad_norm": 0.15854205191135406, "learning_rate": 0.001, "loss": 2.1185, "num_input_tokens_seen": 41595508128, "step": 79350 }, { "epoch": 0.7574799957069773, "grad_norm": 0.14724323153495789, "learning_rate": 0.001, "loss": 2.1244, "num_input_tokens_seen": 41621721792, "step": 79400 }, { "epoch": 0.7579569982231656, "grad_norm": 0.139704167842865, "learning_rate": 0.001, "loss": 2.1359, "num_input_tokens_seen": 41647928864, "step": 79450 }, { "epoch": 0.7584340007393539, "grad_norm": 0.14574068784713745, "learning_rate": 0.001, "loss": 2.1199, "num_input_tokens_seen": 41674139648, "step": 79500 }, { "epoch": 0.7584340007393539, "eval_loss": 2.0389485359191895, "eval_runtime": 80.0866, "eval_samples_per_second": 62.432, "eval_steps_per_second": 15.608, "num_input_tokens_seen": 41674139648, "step": 79500 }, { "epoch": 0.7589110032555422, "grad_norm": 0.14367084205150604, "learning_rate": 0.001, "loss": 2.123, "num_input_tokens_seen": 41700327904, "step": 79550 }, { "epoch": 0.7593880057717305, "grad_norm": 0.1638575941324234, "learning_rate": 0.001, "loss": 2.1228, "num_input_tokens_seen": 41726541536, "step": 79600 }, { "epoch": 0.7598650082879187, "grad_norm": 0.14226502180099487, "learning_rate": 0.001, "loss": 2.1423, "num_input_tokens_seen": 41752751392, "step": 79650 }, { "epoch": 0.760342010804107, "grad_norm": 0.13502418994903564, "learning_rate": 0.001, "loss": 2.12, "num_input_tokens_seen": 41778958816, "step": 79700 }, { "epoch": 0.7608190133202952, "grad_norm": 0.1341133862733841, "learning_rate": 0.001, "loss": 2.117, "num_input_tokens_seen": 41805163168, "step": 79750 }, { "epoch": 0.7612960158364835, "grad_norm": 0.14015237987041473, "learning_rate": 0.001, "loss": 2.1248, "num_input_tokens_seen": 41831374080, "step": 79800 }, { "epoch": 0.7617730183526719, "grad_norm": 0.14166907966136932, "learning_rate": 0.001, "loss": 2.1307, "num_input_tokens_seen": 41857587424, "step": 79850 }, { "epoch": 0.7622500208688601, "grad_norm": 0.21544745564460754, "learning_rate": 0.001, "loss": 2.1442, "num_input_tokens_seen": 41883794976, "step": 79900 }, { "epoch": 0.7627270233850484, "grad_norm": 0.13640902936458588, "learning_rate": 0.001, "loss": 2.157, "num_input_tokens_seen": 41910007040, "step": 79950 }, { "epoch": 0.7632040259012366, "grad_norm": 0.13672830164432526, "learning_rate": 0.001, "loss": 2.129, "num_input_tokens_seen": 41936220160, "step": 80000 }, { "epoch": 0.7632040259012366, "eval_loss": 2.04022479057312, "eval_runtime": 79.3648, "eval_samples_per_second": 63.0, "eval_steps_per_second": 15.75, "num_input_tokens_seen": 41936220160, "step": 80000 }, { "epoch": 0.7636810284174249, "grad_norm": 0.1411873996257782, "learning_rate": 0.001, "loss": 2.1271, "num_input_tokens_seen": 41962432768, "step": 80050 }, { "epoch": 0.7641580309336131, "grad_norm": 0.14791899919509888, "learning_rate": 0.001, "loss": 2.1298, "num_input_tokens_seen": 41988646144, "step": 80100 }, { "epoch": 0.7646350334498014, "grad_norm": 0.1465454399585724, "learning_rate": 0.001, "loss": 2.1185, "num_input_tokens_seen": 42014860544, "step": 80150 }, { "epoch": 0.7651120359659898, "grad_norm": 0.1528947502374649, "learning_rate": 0.001, "loss": 2.115, "num_input_tokens_seen": 42041067744, "step": 80200 }, { "epoch": 0.765589038482178, "grad_norm": 0.21168603003025055, "learning_rate": 0.001, "loss": 2.1304, "num_input_tokens_seen": 42067282144, "step": 80250 }, { "epoch": 0.7660660409983663, "grad_norm": 0.1547636091709137, "learning_rate": 0.001, "loss": 2.1268, "num_input_tokens_seen": 42093496544, "step": 80300 }, { "epoch": 0.7665430435145545, "grad_norm": 0.1418161541223526, "learning_rate": 0.001, "loss": 2.1273, "num_input_tokens_seen": 42119710944, "step": 80350 }, { "epoch": 0.7670200460307428, "grad_norm": 0.1477021723985672, "learning_rate": 0.001, "loss": 2.1261, "num_input_tokens_seen": 42145920448, "step": 80400 }, { "epoch": 0.7674970485469311, "grad_norm": 0.14230799674987793, "learning_rate": 0.001, "loss": 2.118, "num_input_tokens_seen": 42172131008, "step": 80450 }, { "epoch": 0.7679740510631193, "grad_norm": 0.14658768475055695, "learning_rate": 0.001, "loss": 2.1247, "num_input_tokens_seen": 42198339200, "step": 80500 }, { "epoch": 0.7679740510631193, "eval_loss": 2.0385217666625977, "eval_runtime": 79.7406, "eval_samples_per_second": 62.703, "eval_steps_per_second": 15.676, "num_input_tokens_seen": 42198339200, "step": 80500 }, { "epoch": 0.7684510535793077, "grad_norm": 0.14163857698440552, "learning_rate": 0.001, "loss": 2.1172, "num_input_tokens_seen": 42224552832, "step": 80550 }, { "epoch": 0.7689280560954959, "grad_norm": 0.14124558866024017, "learning_rate": 0.001, "loss": 2.1313, "num_input_tokens_seen": 42250764928, "step": 80600 }, { "epoch": 0.7694050586116842, "grad_norm": 0.14133115112781525, "learning_rate": 0.001, "loss": 2.1207, "num_input_tokens_seen": 42276974400, "step": 80650 }, { "epoch": 0.7698820611278725, "grad_norm": 0.15105165541172028, "learning_rate": 0.001, "loss": 2.1235, "num_input_tokens_seen": 42303188800, "step": 80700 }, { "epoch": 0.7703590636440607, "grad_norm": 0.1437007337808609, "learning_rate": 0.001, "loss": 2.1153, "num_input_tokens_seen": 42329398016, "step": 80750 }, { "epoch": 0.770836066160249, "grad_norm": 0.138414204120636, "learning_rate": 0.001, "loss": 2.1218, "num_input_tokens_seen": 42355602880, "step": 80800 }, { "epoch": 0.7713130686764372, "grad_norm": 0.15313681960105896, "learning_rate": 0.001, "loss": 2.1272, "num_input_tokens_seen": 42381812672, "step": 80850 }, { "epoch": 0.7717900711926255, "grad_norm": 0.1474558264017105, "learning_rate": 0.001, "loss": 2.1233, "num_input_tokens_seen": 42408026560, "step": 80900 }, { "epoch": 0.7722670737088139, "grad_norm": 0.14552924036979675, "learning_rate": 0.001, "loss": 2.1171, "num_input_tokens_seen": 42434238880, "step": 80950 }, { "epoch": 0.7727440762250021, "grad_norm": 0.14388687908649445, "learning_rate": 0.001, "loss": 2.1147, "num_input_tokens_seen": 42460451136, "step": 81000 }, { "epoch": 0.7727440762250021, "eval_loss": 2.0367112159729004, "eval_runtime": 79.262, "eval_samples_per_second": 63.082, "eval_steps_per_second": 15.77, "num_input_tokens_seen": 42460451136, "step": 81000 }, { "epoch": 0.7732210787411904, "grad_norm": 0.14675873517990112, "learning_rate": 0.001, "loss": 2.1202, "num_input_tokens_seen": 42486665536, "step": 81050 }, { "epoch": 0.7736980812573786, "grad_norm": 0.14389154314994812, "learning_rate": 0.001, "loss": 2.1267, "num_input_tokens_seen": 42512873152, "step": 81100 }, { "epoch": 0.7741750837735669, "grad_norm": 0.13341355323791504, "learning_rate": 0.001, "loss": 2.1242, "num_input_tokens_seen": 42539087552, "step": 81150 }, { "epoch": 0.7746520862897551, "grad_norm": 0.14013975858688354, "learning_rate": 0.001, "loss": 2.1256, "num_input_tokens_seen": 42565299392, "step": 81200 }, { "epoch": 0.7751290888059434, "grad_norm": 0.1397426873445511, "learning_rate": 0.001, "loss": 2.1191, "num_input_tokens_seen": 42591510720, "step": 81250 }, { "epoch": 0.7756060913221318, "grad_norm": 0.148366779088974, "learning_rate": 0.001, "loss": 2.121, "num_input_tokens_seen": 42617720416, "step": 81300 }, { "epoch": 0.77608309383832, "grad_norm": 0.14177195727825165, "learning_rate": 0.001, "loss": 2.1253, "num_input_tokens_seen": 42643931360, "step": 81350 }, { "epoch": 0.7765600963545083, "grad_norm": 0.15625756978988647, "learning_rate": 0.001, "loss": 2.1161, "num_input_tokens_seen": 42670145344, "step": 81400 }, { "epoch": 0.7770370988706965, "grad_norm": 0.16141097247600555, "learning_rate": 0.001, "loss": 2.1144, "num_input_tokens_seen": 42696355872, "step": 81450 }, { "epoch": 0.7775141013868848, "grad_norm": 0.14571966230869293, "learning_rate": 0.001, "loss": 2.1265, "num_input_tokens_seen": 42722562592, "step": 81500 }, { "epoch": 0.7775141013868848, "eval_loss": 2.0364601612091064, "eval_runtime": 79.6872, "eval_samples_per_second": 62.745, "eval_steps_per_second": 15.686, "num_input_tokens_seen": 42722562592, "step": 81500 }, { "epoch": 0.7779911039030731, "grad_norm": 0.14065228402614594, "learning_rate": 0.001, "loss": 2.1229, "num_input_tokens_seen": 42748776992, "step": 81550 }, { "epoch": 0.7784681064192613, "grad_norm": 0.13356003165245056, "learning_rate": 0.001, "loss": 2.1122, "num_input_tokens_seen": 42774986944, "step": 81600 }, { "epoch": 0.7789451089354497, "grad_norm": 0.1398439258337021, "learning_rate": 0.001, "loss": 2.1304, "num_input_tokens_seen": 42801193248, "step": 81650 }, { "epoch": 0.7794221114516379, "grad_norm": 0.14399580657482147, "learning_rate": 0.001, "loss": 2.1191, "num_input_tokens_seen": 42827397664, "step": 81700 }, { "epoch": 0.7798991139678262, "grad_norm": 0.1511550098657608, "learning_rate": 0.001, "loss": 2.1299, "num_input_tokens_seen": 42853609184, "step": 81750 }, { "epoch": 0.7803761164840145, "grad_norm": 0.13643226027488708, "learning_rate": 0.001, "loss": 2.1176, "num_input_tokens_seen": 42879823584, "step": 81800 }, { "epoch": 0.7808531190002027, "grad_norm": 0.15320724248886108, "learning_rate": 0.001, "loss": 2.1268, "num_input_tokens_seen": 42906032096, "step": 81850 }, { "epoch": 0.781330121516391, "grad_norm": 0.15477311611175537, "learning_rate": 0.001, "loss": 2.1173, "num_input_tokens_seen": 42932238784, "step": 81900 }, { "epoch": 0.7818071240325792, "grad_norm": 0.1393759399652481, "learning_rate": 0.001, "loss": 2.1148, "num_input_tokens_seen": 42958444704, "step": 81950 }, { "epoch": 0.7822841265487676, "grad_norm": 0.14024987816810608, "learning_rate": 0.001, "loss": 2.1254, "num_input_tokens_seen": 42984652928, "step": 82000 }, { "epoch": 0.7822841265487676, "eval_loss": 2.035764694213867, "eval_runtime": 80.5252, "eval_samples_per_second": 62.092, "eval_steps_per_second": 15.523, "num_input_tokens_seen": 42984652928, "step": 82000 }, { "epoch": 0.7827611290649558, "grad_norm": 0.14049945771694183, "learning_rate": 0.001, "loss": 2.1148, "num_input_tokens_seen": 43010864736, "step": 82050 }, { "epoch": 0.7832381315811441, "grad_norm": 0.14106184244155884, "learning_rate": 0.001, "loss": 2.127, "num_input_tokens_seen": 43037075424, "step": 82100 }, { "epoch": 0.7837151340973324, "grad_norm": 0.153049498796463, "learning_rate": 0.001, "loss": 2.1243, "num_input_tokens_seen": 43063287616, "step": 82150 }, { "epoch": 0.7841921366135206, "grad_norm": 0.13522286713123322, "learning_rate": 0.001, "loss": 2.1202, "num_input_tokens_seen": 43089496256, "step": 82200 }, { "epoch": 0.7846691391297089, "grad_norm": 0.1484510898590088, "learning_rate": 0.001, "loss": 2.1174, "num_input_tokens_seen": 43115709216, "step": 82250 }, { "epoch": 0.7851461416458971, "grad_norm": 0.13766340911388397, "learning_rate": 0.001, "loss": 2.1113, "num_input_tokens_seen": 43141917216, "step": 82300 }, { "epoch": 0.7856231441620855, "grad_norm": 0.15031974017620087, "learning_rate": 0.001, "loss": 2.1122, "num_input_tokens_seen": 43168114496, "step": 82350 }, { "epoch": 0.7861001466782738, "grad_norm": 0.1554354727268219, "learning_rate": 0.001, "loss": 2.1179, "num_input_tokens_seen": 43194316224, "step": 82400 }, { "epoch": 0.786577149194462, "grad_norm": 0.1402343511581421, "learning_rate": 0.001, "loss": 2.1107, "num_input_tokens_seen": 43220521024, "step": 82450 }, { "epoch": 0.7870541517106503, "grad_norm": 0.1455002725124359, "learning_rate": 0.001, "loss": 2.1121, "num_input_tokens_seen": 43246727552, "step": 82500 }, { "epoch": 0.7870541517106503, "eval_loss": 2.0353808403015137, "eval_runtime": 79.7931, "eval_samples_per_second": 62.662, "eval_steps_per_second": 15.666, "num_input_tokens_seen": 43246727552, "step": 82500 }, { "epoch": 0.7875311542268385, "grad_norm": 0.13784560561180115, "learning_rate": 0.001, "loss": 2.1218, "num_input_tokens_seen": 43272941952, "step": 82550 }, { "epoch": 0.7880081567430268, "grad_norm": 0.16629286110401154, "learning_rate": 0.001, "loss": 2.1257, "num_input_tokens_seen": 43299148416, "step": 82600 }, { "epoch": 0.7884851592592151, "grad_norm": 0.14138463139533997, "learning_rate": 0.001, "loss": 2.1247, "num_input_tokens_seen": 43325357152, "step": 82650 }, { "epoch": 0.7889621617754033, "grad_norm": 0.1476413905620575, "learning_rate": 0.001, "loss": 2.1261, "num_input_tokens_seen": 43351571552, "step": 82700 }, { "epoch": 0.7894391642915917, "grad_norm": 0.14458167552947998, "learning_rate": 0.001, "loss": 2.1186, "num_input_tokens_seen": 43377781824, "step": 82750 }, { "epoch": 0.7899161668077799, "grad_norm": 0.14135339856147766, "learning_rate": 0.001, "loss": 2.1145, "num_input_tokens_seen": 43403986336, "step": 82800 }, { "epoch": 0.7903931693239682, "grad_norm": 0.1399611085653305, "learning_rate": 0.001, "loss": 2.1116, "num_input_tokens_seen": 43430200736, "step": 82850 }, { "epoch": 0.7908701718401565, "grad_norm": 0.14598487317562103, "learning_rate": 0.001, "loss": 2.1219, "num_input_tokens_seen": 43456414848, "step": 82900 }, { "epoch": 0.7913471743563447, "grad_norm": 0.14239011704921722, "learning_rate": 0.001, "loss": 2.1049, "num_input_tokens_seen": 43482628512, "step": 82950 }, { "epoch": 0.791824176872533, "grad_norm": 0.1495833843946457, "learning_rate": 0.001, "loss": 2.1235, "num_input_tokens_seen": 43508842912, "step": 83000 }, { "epoch": 0.791824176872533, "eval_loss": 2.0351996421813965, "eval_runtime": 80.7069, "eval_samples_per_second": 61.953, "eval_steps_per_second": 15.488, "num_input_tokens_seen": 43508842912, "step": 83000 }, { "epoch": 0.7923011793887212, "grad_norm": 0.14430342614650726, "learning_rate": 0.001, "loss": 2.1139, "num_input_tokens_seen": 43535053536, "step": 83050 }, { "epoch": 0.7927781819049096, "grad_norm": 0.14061962068080902, "learning_rate": 0.001, "loss": 2.1227, "num_input_tokens_seen": 43561267936, "step": 83100 }, { "epoch": 0.7932551844210978, "grad_norm": 0.14604133367538452, "learning_rate": 0.001, "loss": 2.1224, "num_input_tokens_seen": 43587477408, "step": 83150 }, { "epoch": 0.7937321869372861, "grad_norm": 0.1432175487279892, "learning_rate": 0.001, "loss": 2.1277, "num_input_tokens_seen": 43613684960, "step": 83200 }, { "epoch": 0.7942091894534744, "grad_norm": 0.15234777331352234, "learning_rate": 0.001, "loss": 2.1082, "num_input_tokens_seen": 43639894112, "step": 83250 }, { "epoch": 0.7946861919696626, "grad_norm": 0.14436079561710358, "learning_rate": 0.001, "loss": 2.1207, "num_input_tokens_seen": 43666103104, "step": 83300 }, { "epoch": 0.7951631944858509, "grad_norm": 0.14395667612552643, "learning_rate": 0.001, "loss": 2.1219, "num_input_tokens_seen": 43692313184, "step": 83350 }, { "epoch": 0.7956401970020391, "grad_norm": 0.13969875872135162, "learning_rate": 0.001, "loss": 2.1152, "num_input_tokens_seen": 43718525536, "step": 83400 }, { "epoch": 0.7961171995182275, "grad_norm": 0.151366725564003, "learning_rate": 0.001, "loss": 2.1168, "num_input_tokens_seen": 43744737728, "step": 83450 }, { "epoch": 0.7965942020344158, "grad_norm": 0.13248160481452942, "learning_rate": 0.001, "loss": 2.1192, "num_input_tokens_seen": 43770947360, "step": 83500 }, { "epoch": 0.7965942020344158, "eval_loss": 2.0351762771606445, "eval_runtime": 79.3184, "eval_samples_per_second": 63.037, "eval_steps_per_second": 15.759, "num_input_tokens_seen": 43770947360, "step": 83500 }, { "epoch": 0.797071204550604, "grad_norm": 0.14381654560565948, "learning_rate": 0.001, "loss": 2.1128, "num_input_tokens_seen": 43797158272, "step": 83550 }, { "epoch": 0.7975482070667923, "grad_norm": 0.1607636660337448, "learning_rate": 0.001, "loss": 2.1185, "num_input_tokens_seen": 43823371904, "step": 83600 }, { "epoch": 0.7980252095829805, "grad_norm": 0.1534896194934845, "learning_rate": 0.001, "loss": 2.1191, "num_input_tokens_seen": 43849584512, "step": 83650 }, { "epoch": 0.7985022120991688, "grad_norm": 0.1401808112859726, "learning_rate": 0.001, "loss": 2.1228, "num_input_tokens_seen": 43875796832, "step": 83700 }, { "epoch": 0.7989792146153571, "grad_norm": 0.15275578200817108, "learning_rate": 0.001, "loss": 2.1153, "num_input_tokens_seen": 43902011232, "step": 83750 }, { "epoch": 0.7994562171315454, "grad_norm": 0.13924409449100494, "learning_rate": 0.001, "loss": 2.1208, "num_input_tokens_seen": 43928216352, "step": 83800 }, { "epoch": 0.7999332196477337, "grad_norm": 0.18342813849449158, "learning_rate": 0.001, "loss": 2.1694, "num_input_tokens_seen": 43954427648, "step": 83850 }, { "epoch": 0.8004102221639219, "grad_norm": 0.14373578131198883, "learning_rate": 0.001, "loss": 2.1333, "num_input_tokens_seen": 43980640928, "step": 83900 }, { "epoch": 0.8008872246801102, "grad_norm": 0.1297065019607544, "learning_rate": 0.001, "loss": 2.134, "num_input_tokens_seen": 44006842240, "step": 83950 }, { "epoch": 0.8013642271962984, "grad_norm": 0.14140845835208893, "learning_rate": 0.001, "loss": 2.1303, "num_input_tokens_seen": 44033053408, "step": 84000 }, { "epoch": 0.8013642271962984, "eval_loss": 2.038163185119629, "eval_runtime": 80.0722, "eval_samples_per_second": 62.444, "eval_steps_per_second": 15.611, "num_input_tokens_seen": 44033053408, "step": 84000 }, { "epoch": 0.8018412297124867, "grad_norm": 0.13576118648052216, "learning_rate": 0.001, "loss": 2.1238, "num_input_tokens_seen": 44059265280, "step": 84050 }, { "epoch": 0.802318232228675, "grad_norm": 0.14275366067886353, "learning_rate": 0.001, "loss": 2.1173, "num_input_tokens_seen": 44085474144, "step": 84100 }, { "epoch": 0.8027952347448633, "grad_norm": 0.1358124315738678, "learning_rate": 0.001, "loss": 2.1174, "num_input_tokens_seen": 44111683840, "step": 84150 }, { "epoch": 0.8032722372610516, "grad_norm": 0.14228695631027222, "learning_rate": 0.001, "loss": 2.1142, "num_input_tokens_seen": 44137898240, "step": 84200 }, { "epoch": 0.8037492397772398, "grad_norm": 0.13854092359542847, "learning_rate": 0.001, "loss": 2.1099, "num_input_tokens_seen": 44164112544, "step": 84250 }, { "epoch": 0.8042262422934281, "grad_norm": 0.13311374187469482, "learning_rate": 0.001, "loss": 2.1177, "num_input_tokens_seen": 44190326560, "step": 84300 }, { "epoch": 0.8047032448096164, "grad_norm": 0.13964787125587463, "learning_rate": 0.001, "loss": 2.122, "num_input_tokens_seen": 44216539296, "step": 84350 }, { "epoch": 0.8051802473258046, "grad_norm": 0.13378119468688965, "learning_rate": 0.001, "loss": 2.1147, "num_input_tokens_seen": 44242753696, "step": 84400 }, { "epoch": 0.8056572498419929, "grad_norm": 0.1523156464099884, "learning_rate": 0.001, "loss": 2.1218, "num_input_tokens_seen": 44268968096, "step": 84450 }, { "epoch": 0.8061342523581811, "grad_norm": 0.14103132486343384, "learning_rate": 0.001, "loss": 2.1247, "num_input_tokens_seen": 44295182016, "step": 84500 }, { "epoch": 0.8061342523581811, "eval_loss": 2.033738374710083, "eval_runtime": 79.8888, "eval_samples_per_second": 62.587, "eval_steps_per_second": 15.647, "num_input_tokens_seen": 44295182016, "step": 84500 }, { "epoch": 0.8066112548743695, "grad_norm": 0.14938974380493164, "learning_rate": 0.001, "loss": 2.1105, "num_input_tokens_seen": 44321393184, "step": 84550 }, { "epoch": 0.8070882573905578, "grad_norm": 0.15471114218235016, "learning_rate": 0.001, "loss": 2.124, "num_input_tokens_seen": 44347607584, "step": 84600 }, { "epoch": 0.807565259906746, "grad_norm": 0.14569403231143951, "learning_rate": 0.001, "loss": 2.1155, "num_input_tokens_seen": 44373816224, "step": 84650 }, { "epoch": 0.8080422624229343, "grad_norm": 0.1456989198923111, "learning_rate": 0.001, "loss": 2.1251, "num_input_tokens_seen": 44400021216, "step": 84700 }, { "epoch": 0.8085192649391225, "grad_norm": 0.1367526352405548, "learning_rate": 0.001, "loss": 2.1146, "num_input_tokens_seen": 44426223008, "step": 84750 }, { "epoch": 0.8089962674553108, "grad_norm": 0.14826616644859314, "learning_rate": 0.001, "loss": 2.1185, "num_input_tokens_seen": 44452436000, "step": 84800 }, { "epoch": 0.8094732699714992, "grad_norm": 0.15011751651763916, "learning_rate": 0.001, "loss": 2.1281, "num_input_tokens_seen": 44478650368, "step": 84850 }, { "epoch": 0.8099502724876874, "grad_norm": 0.14236512780189514, "learning_rate": 0.001, "loss": 2.1205, "num_input_tokens_seen": 44504864768, "step": 84900 }, { "epoch": 0.8104272750038757, "grad_norm": 0.14300031960010529, "learning_rate": 0.001, "loss": 2.1319, "num_input_tokens_seen": 44531075200, "step": 84950 }, { "epoch": 0.8109042775200639, "grad_norm": 0.13161155581474304, "learning_rate": 0.001, "loss": 2.125, "num_input_tokens_seen": 44557289600, "step": 85000 }, { "epoch": 0.8109042775200639, "eval_loss": 2.032745599746704, "eval_runtime": 80.1389, "eval_samples_per_second": 62.392, "eval_steps_per_second": 15.598, "num_input_tokens_seen": 44557289600, "step": 85000 }, { "epoch": 0.8113812800362522, "grad_norm": 0.15233619511127472, "learning_rate": 0.001, "loss": 2.1097, "num_input_tokens_seen": 44583500576, "step": 85050 }, { "epoch": 0.8118582825524404, "grad_norm": 0.15628154575824738, "learning_rate": 0.001, "loss": 2.1166, "num_input_tokens_seen": 44609714112, "step": 85100 }, { "epoch": 0.8123352850686287, "grad_norm": 0.15254861116409302, "learning_rate": 0.001, "loss": 2.1243, "num_input_tokens_seen": 44635927648, "step": 85150 }, { "epoch": 0.812812287584817, "grad_norm": 0.1345020979642868, "learning_rate": 0.001, "loss": 2.1265, "num_input_tokens_seen": 44662141408, "step": 85200 }, { "epoch": 0.8132892901010053, "grad_norm": 0.14372238516807556, "learning_rate": 0.001, "loss": 2.1238, "num_input_tokens_seen": 44688355808, "step": 85250 }, { "epoch": 0.8137662926171936, "grad_norm": 0.14160767197608948, "learning_rate": 0.001, "loss": 2.1276, "num_input_tokens_seen": 44714565152, "step": 85300 }, { "epoch": 0.8142432951333818, "grad_norm": 0.15178006887435913, "learning_rate": 0.001, "loss": 2.1171, "num_input_tokens_seen": 44740776032, "step": 85350 }, { "epoch": 0.8147202976495701, "grad_norm": 0.1339855045080185, "learning_rate": 0.001, "loss": 2.0985, "num_input_tokens_seen": 44766988608, "step": 85400 }, { "epoch": 0.8151973001657584, "grad_norm": 0.1480085402727127, "learning_rate": 0.001, "loss": 2.1102, "num_input_tokens_seen": 44793200896, "step": 85450 }, { "epoch": 0.8156743026819466, "grad_norm": 0.18293645977973938, "learning_rate": 0.001, "loss": 2.1218, "num_input_tokens_seen": 44819414752, "step": 85500 }, { "epoch": 0.8156743026819466, "eval_loss": 2.031679630279541, "eval_runtime": 79.5824, "eval_samples_per_second": 62.828, "eval_steps_per_second": 15.707, "num_input_tokens_seen": 44819414752, "step": 85500 }, { "epoch": 0.816151305198135, "grad_norm": 0.14524328708648682, "learning_rate": 0.001, "loss": 2.1085, "num_input_tokens_seen": 44845612704, "step": 85550 }, { "epoch": 0.8166283077143232, "grad_norm": 0.14643549919128418, "learning_rate": 0.001, "loss": 2.1151, "num_input_tokens_seen": 44871822016, "step": 85600 }, { "epoch": 0.8171053102305115, "grad_norm": 0.14329664409160614, "learning_rate": 0.001, "loss": 2.1226, "num_input_tokens_seen": 44898035456, "step": 85650 }, { "epoch": 0.8175823127466998, "grad_norm": 0.14474226534366608, "learning_rate": 0.001, "loss": 2.109, "num_input_tokens_seen": 44924248160, "step": 85700 }, { "epoch": 0.818059315262888, "grad_norm": 0.15638591349124908, "learning_rate": 0.001, "loss": 2.1111, "num_input_tokens_seen": 44950456736, "step": 85750 }, { "epoch": 0.8185363177790763, "grad_norm": 0.14359885454177856, "learning_rate": 0.001, "loss": 2.1123, "num_input_tokens_seen": 44976671136, "step": 85800 }, { "epoch": 0.8190133202952645, "grad_norm": 0.14419220387935638, "learning_rate": 0.001, "loss": 2.1071, "num_input_tokens_seen": 45002877984, "step": 85850 }, { "epoch": 0.8194903228114528, "grad_norm": 0.1485709697008133, "learning_rate": 0.001, "loss": 2.1094, "num_input_tokens_seen": 45029088992, "step": 85900 }, { "epoch": 0.819967325327641, "grad_norm": 0.14082056283950806, "learning_rate": 0.001, "loss": 2.1182, "num_input_tokens_seen": 45055300768, "step": 85950 }, { "epoch": 0.8204443278438294, "grad_norm": 0.13490447402000427, "learning_rate": 0.001, "loss": 2.1122, "num_input_tokens_seen": 45081505024, "step": 86000 }, { "epoch": 0.8204443278438294, "eval_loss": 2.0312297344207764, "eval_runtime": 80.2167, "eval_samples_per_second": 62.331, "eval_steps_per_second": 15.583, "num_input_tokens_seen": 45081505024, "step": 86000 }, { "epoch": 0.8209213303600177, "grad_norm": 0.14423319697380066, "learning_rate": 0.001, "loss": 2.1117, "num_input_tokens_seen": 45107719424, "step": 86050 }, { "epoch": 0.8213983328762059, "grad_norm": 0.14507217705249786, "learning_rate": 0.001, "loss": 2.1116, "num_input_tokens_seen": 45133929184, "step": 86100 }, { "epoch": 0.8218753353923942, "grad_norm": 0.14613692462444305, "learning_rate": 0.001, "loss": 2.1068, "num_input_tokens_seen": 45160135904, "step": 86150 }, { "epoch": 0.8223523379085824, "grad_norm": 0.15299580991268158, "learning_rate": 0.001, "loss": 2.1134, "num_input_tokens_seen": 45186346976, "step": 86200 }, { "epoch": 0.8228293404247707, "grad_norm": 0.16637030243873596, "learning_rate": 0.001, "loss": 2.116, "num_input_tokens_seen": 45212552576, "step": 86250 }, { "epoch": 0.8233063429409591, "grad_norm": 0.14588510990142822, "learning_rate": 0.001, "loss": 2.1157, "num_input_tokens_seen": 45238766976, "step": 86300 }, { "epoch": 0.8237833454571473, "grad_norm": 0.1367158144712448, "learning_rate": 0.001, "loss": 2.1215, "num_input_tokens_seen": 45264975488, "step": 86350 }, { "epoch": 0.8242603479733356, "grad_norm": 0.14144419133663177, "learning_rate": 0.001, "loss": 2.1124, "num_input_tokens_seen": 45291175360, "step": 86400 }, { "epoch": 0.8247373504895238, "grad_norm": 0.1478971391916275, "learning_rate": 0.001, "loss": 2.1151, "num_input_tokens_seen": 45317382912, "step": 86450 }, { "epoch": 0.8252143530057121, "grad_norm": 0.14577680826187134, "learning_rate": 0.001, "loss": 2.1293, "num_input_tokens_seen": 45343592896, "step": 86500 }, { "epoch": 0.8252143530057121, "eval_loss": 2.0312957763671875, "eval_runtime": 80.127, "eval_samples_per_second": 62.401, "eval_steps_per_second": 15.6, "num_input_tokens_seen": 45343592896, "step": 86500 }, { "epoch": 0.8256913555219004, "grad_norm": 0.1390218287706375, "learning_rate": 0.001, "loss": 2.1133, "num_input_tokens_seen": 45369805792, "step": 86550 }, { "epoch": 0.8261683580380886, "grad_norm": 0.1416807472705841, "learning_rate": 0.001, "loss": 2.1144, "num_input_tokens_seen": 45396019232, "step": 86600 }, { "epoch": 0.826645360554277, "grad_norm": 0.1379116177558899, "learning_rate": 0.001, "loss": 2.1105, "num_input_tokens_seen": 45422224160, "step": 86650 }, { "epoch": 0.8271223630704652, "grad_norm": 0.13901980221271515, "learning_rate": 0.001, "loss": 2.1092, "num_input_tokens_seen": 45448438560, "step": 86700 }, { "epoch": 0.8275993655866535, "grad_norm": 0.14398328959941864, "learning_rate": 0.001, "loss": 2.1071, "num_input_tokens_seen": 45474652960, "step": 86750 }, { "epoch": 0.8280763681028418, "grad_norm": 0.14946867525577545, "learning_rate": 0.001, "loss": 2.1299, "num_input_tokens_seen": 45500863392, "step": 86800 }, { "epoch": 0.82855337061903, "grad_norm": 0.14332331717014313, "learning_rate": 0.001, "loss": 2.1204, "num_input_tokens_seen": 45527074752, "step": 86850 }, { "epoch": 0.8290303731352183, "grad_norm": 0.15574564039707184, "learning_rate": 0.001, "loss": 2.1226, "num_input_tokens_seen": 45553284800, "step": 86900 }, { "epoch": 0.8295073756514065, "grad_norm": 0.12894290685653687, "learning_rate": 0.001, "loss": 2.1195, "num_input_tokens_seen": 45579494208, "step": 86950 }, { "epoch": 0.8299843781675948, "grad_norm": 0.14729012548923492, "learning_rate": 0.001, "loss": 2.1161, "num_input_tokens_seen": 45605708608, "step": 87000 }, { "epoch": 0.8299843781675948, "eval_loss": 2.0334672927856445, "eval_runtime": 80.1275, "eval_samples_per_second": 62.401, "eval_steps_per_second": 15.6, "num_input_tokens_seen": 45605708608, "step": 87000 }, { "epoch": 0.8304613806837831, "grad_norm": 0.1420363485813141, "learning_rate": 0.001, "loss": 2.1129, "num_input_tokens_seen": 45631921280, "step": 87050 }, { "epoch": 0.8309383831999714, "grad_norm": 0.15057820081710815, "learning_rate": 0.001, "loss": 2.1131, "num_input_tokens_seen": 45658134272, "step": 87100 }, { "epoch": 0.8314153857161597, "grad_norm": 0.1428225189447403, "learning_rate": 0.001, "loss": 2.0981, "num_input_tokens_seen": 45684341440, "step": 87150 }, { "epoch": 0.8318923882323479, "grad_norm": 0.1462431401014328, "learning_rate": 0.001, "loss": 2.1211, "num_input_tokens_seen": 45710546944, "step": 87200 }, { "epoch": 0.8323693907485362, "grad_norm": 0.14011114835739136, "learning_rate": 0.001, "loss": 2.1104, "num_input_tokens_seen": 45736761344, "step": 87250 }, { "epoch": 0.8328463932647244, "grad_norm": 0.14002341032028198, "learning_rate": 0.001, "loss": 2.1158, "num_input_tokens_seen": 45762972928, "step": 87300 }, { "epoch": 0.8333233957809127, "grad_norm": 0.14873993396759033, "learning_rate": 0.001, "loss": 2.116, "num_input_tokens_seen": 45789185152, "step": 87350 }, { "epoch": 0.8338003982971011, "grad_norm": 0.15025608241558075, "learning_rate": 0.001, "loss": 2.1213, "num_input_tokens_seen": 45815394784, "step": 87400 }, { "epoch": 0.8342774008132893, "grad_norm": 0.23774513602256775, "learning_rate": 0.001, "loss": 2.118, "num_input_tokens_seen": 45841605312, "step": 87450 }, { "epoch": 0.8347544033294776, "grad_norm": 0.15170574188232422, "learning_rate": 0.001, "loss": 2.1237, "num_input_tokens_seen": 45867808448, "step": 87500 }, { "epoch": 0.8347544033294776, "eval_loss": 2.036484718322754, "eval_runtime": 79.7877, "eval_samples_per_second": 62.666, "eval_steps_per_second": 15.667, "num_input_tokens_seen": 45867808448, "step": 87500 }, { "epoch": 0.8352314058456658, "grad_norm": 0.14227113127708435, "learning_rate": 0.001, "loss": 2.1245, "num_input_tokens_seen": 45894014560, "step": 87550 }, { "epoch": 0.8357084083618541, "grad_norm": 0.13852350413799286, "learning_rate": 0.001, "loss": 2.1221, "num_input_tokens_seen": 45920222752, "step": 87600 }, { "epoch": 0.8361854108780424, "grad_norm": 0.1606854796409607, "learning_rate": 0.001, "loss": 2.1237, "num_input_tokens_seen": 45946435392, "step": 87650 }, { "epoch": 0.8366624133942306, "grad_norm": 0.13357940316200256, "learning_rate": 0.001, "loss": 2.1191, "num_input_tokens_seen": 45972631840, "step": 87700 }, { "epoch": 0.837139415910419, "grad_norm": 0.1375136822462082, "learning_rate": 0.001, "loss": 2.1137, "num_input_tokens_seen": 45998825152, "step": 87750 }, { "epoch": 0.8376164184266072, "grad_norm": 0.14692631363868713, "learning_rate": 0.001, "loss": 2.1137, "num_input_tokens_seen": 46025038752, "step": 87800 }, { "epoch": 0.8380934209427955, "grad_norm": 0.1487261950969696, "learning_rate": 0.001, "loss": 2.12, "num_input_tokens_seen": 46051252512, "step": 87850 }, { "epoch": 0.8385704234589837, "grad_norm": 0.13279280066490173, "learning_rate": 0.001, "loss": 2.1101, "num_input_tokens_seen": 46077466688, "step": 87900 }, { "epoch": 0.839047425975172, "grad_norm": 0.17393696308135986, "learning_rate": 0.001, "loss": 2.1021, "num_input_tokens_seen": 46103677312, "step": 87950 }, { "epoch": 0.8395244284913603, "grad_norm": 0.13198982179164886, "learning_rate": 0.001, "loss": 2.1206, "num_input_tokens_seen": 46129884928, "step": 88000 }, { "epoch": 0.8395244284913603, "eval_loss": 2.031299591064453, "eval_runtime": 79.9006, "eval_samples_per_second": 62.578, "eval_steps_per_second": 15.644, "num_input_tokens_seen": 46129884928, "step": 88000 }, { "epoch": 0.8400014310075485, "grad_norm": 0.1394035667181015, "learning_rate": 0.001, "loss": 2.1158, "num_input_tokens_seen": 46156095744, "step": 88050 }, { "epoch": 0.8404784335237369, "grad_norm": 0.13993045687675476, "learning_rate": 0.001, "loss": 2.1124, "num_input_tokens_seen": 46182303680, "step": 88100 }, { "epoch": 0.8409554360399251, "grad_norm": 0.13323958218097687, "learning_rate": 0.001, "loss": 2.1096, "num_input_tokens_seen": 46208506784, "step": 88150 }, { "epoch": 0.8414324385561134, "grad_norm": 0.15271630883216858, "learning_rate": 0.001, "loss": 2.1128, "num_input_tokens_seen": 46234719360, "step": 88200 }, { "epoch": 0.8419094410723017, "grad_norm": 0.14392182230949402, "learning_rate": 0.001, "loss": 2.1105, "num_input_tokens_seen": 46260917728, "step": 88250 }, { "epoch": 0.8423864435884899, "grad_norm": 0.14050635695457458, "learning_rate": 0.001, "loss": 2.1149, "num_input_tokens_seen": 46287129376, "step": 88300 }, { "epoch": 0.8428634461046782, "grad_norm": 0.15242235362529755, "learning_rate": 0.001, "loss": 2.1255, "num_input_tokens_seen": 46313338304, "step": 88350 }, { "epoch": 0.8433404486208664, "grad_norm": 0.1493886262178421, "learning_rate": 0.001, "loss": 2.118, "num_input_tokens_seen": 46339548736, "step": 88400 }, { "epoch": 0.8438174511370548, "grad_norm": 0.1382344514131546, "learning_rate": 0.001, "loss": 2.1152, "num_input_tokens_seen": 46365752704, "step": 88450 }, { "epoch": 0.8442944536532431, "grad_norm": 0.15339982509613037, "learning_rate": 0.001, "loss": 2.1191, "num_input_tokens_seen": 46391967104, "step": 88500 }, { "epoch": 0.8442944536532431, "eval_loss": 2.0305092334747314, "eval_runtime": 79.8949, "eval_samples_per_second": 62.582, "eval_steps_per_second": 15.646, "num_input_tokens_seen": 46391967104, "step": 88500 }, { "epoch": 0.8447714561694313, "grad_norm": 0.1537119597196579, "learning_rate": 0.001, "loss": 2.1037, "num_input_tokens_seen": 46418173888, "step": 88550 }, { "epoch": 0.8452484586856196, "grad_norm": 0.14273403584957123, "learning_rate": 0.001, "loss": 2.1112, "num_input_tokens_seen": 46444386848, "step": 88600 }, { "epoch": 0.8457254612018078, "grad_norm": 0.13516731560230255, "learning_rate": 0.001, "loss": 2.1163, "num_input_tokens_seen": 46470594016, "step": 88650 }, { "epoch": 0.8462024637179961, "grad_norm": 0.1436593383550644, "learning_rate": 0.001, "loss": 2.1127, "num_input_tokens_seen": 46496808416, "step": 88700 }, { "epoch": 0.8466794662341843, "grad_norm": 0.14031122624874115, "learning_rate": 0.001, "loss": 2.1108, "num_input_tokens_seen": 46523019488, "step": 88750 }, { "epoch": 0.8471564687503726, "grad_norm": 0.15727658569812775, "learning_rate": 0.001, "loss": 2.1129, "num_input_tokens_seen": 46549230464, "step": 88800 }, { "epoch": 0.847633471266561, "grad_norm": 0.13983725011348724, "learning_rate": 0.001, "loss": 2.1146, "num_input_tokens_seen": 46575440160, "step": 88850 }, { "epoch": 0.8481104737827492, "grad_norm": 0.14959338307380676, "learning_rate": 0.001, "loss": 2.1149, "num_input_tokens_seen": 46601647200, "step": 88900 }, { "epoch": 0.8485874762989375, "grad_norm": 0.1365756243467331, "learning_rate": 0.001, "loss": 2.1123, "num_input_tokens_seen": 46627859680, "step": 88950 }, { "epoch": 0.8490644788151257, "grad_norm": 0.14246727526187897, "learning_rate": 0.001, "loss": 2.1075, "num_input_tokens_seen": 46654069856, "step": 89000 }, { "epoch": 0.8490644788151257, "eval_loss": 2.0287179946899414, "eval_runtime": 80.2662, "eval_samples_per_second": 62.293, "eval_steps_per_second": 15.573, "num_input_tokens_seen": 46654069856, "step": 89000 }, { "epoch": 0.849541481331314, "grad_norm": 0.15061074495315552, "learning_rate": 0.001, "loss": 2.1186, "num_input_tokens_seen": 46680279040, "step": 89050 }, { "epoch": 0.8500184838475023, "grad_norm": 0.16355903446674347, "learning_rate": 0.001, "loss": 2.1159, "num_input_tokens_seen": 46706491232, "step": 89100 }, { "epoch": 0.8504954863636905, "grad_norm": 0.14321939647197723, "learning_rate": 0.001, "loss": 2.115, "num_input_tokens_seen": 46732690880, "step": 89150 }, { "epoch": 0.8509724888798789, "grad_norm": 0.15475858747959137, "learning_rate": 0.001, "loss": 2.1201, "num_input_tokens_seen": 46758901536, "step": 89200 }, { "epoch": 0.8514494913960671, "grad_norm": 0.14698758721351624, "learning_rate": 0.001, "loss": 2.1247, "num_input_tokens_seen": 46785111584, "step": 89250 }, { "epoch": 0.8519264939122554, "grad_norm": 0.13974907994270325, "learning_rate": 0.001, "loss": 2.1172, "num_input_tokens_seen": 46811325984, "step": 89300 }, { "epoch": 0.8524034964284437, "grad_norm": 0.1385921984910965, "learning_rate": 0.001, "loss": 2.1104, "num_input_tokens_seen": 46837523616, "step": 89350 }, { "epoch": 0.8528804989446319, "grad_norm": 0.1460406333208084, "learning_rate": 0.001, "loss": 2.1144, "num_input_tokens_seen": 46863725792, "step": 89400 }, { "epoch": 0.8533575014608202, "grad_norm": 0.1514638215303421, "learning_rate": 0.001, "loss": 2.1061, "num_input_tokens_seen": 46889930432, "step": 89450 }, { "epoch": 0.8538345039770084, "grad_norm": 0.15317507088184357, "learning_rate": 0.001, "loss": 2.1109, "num_input_tokens_seen": 46916134592, "step": 89500 }, { "epoch": 0.8538345039770084, "eval_loss": 2.0291860103607178, "eval_runtime": 79.9263, "eval_samples_per_second": 62.558, "eval_steps_per_second": 15.639, "num_input_tokens_seen": 46916134592, "step": 89500 }, { "epoch": 0.8543115064931968, "grad_norm": 0.14707760512828827, "learning_rate": 0.001, "loss": 2.1161, "num_input_tokens_seen": 46942348992, "step": 89550 }, { "epoch": 0.8547885090093851, "grad_norm": 0.13737894594669342, "learning_rate": 0.001, "loss": 2.106, "num_input_tokens_seen": 46968561760, "step": 89600 }, { "epoch": 0.8552655115255733, "grad_norm": 0.13339094817638397, "learning_rate": 0.001, "loss": 2.1091, "num_input_tokens_seen": 46994765888, "step": 89650 }, { "epoch": 0.8557425140417616, "grad_norm": 0.14085884392261505, "learning_rate": 0.001, "loss": 2.1081, "num_input_tokens_seen": 47020978208, "step": 89700 }, { "epoch": 0.8562195165579498, "grad_norm": 0.13842567801475525, "learning_rate": 0.001, "loss": 2.1173, "num_input_tokens_seen": 47047190592, "step": 89750 }, { "epoch": 0.8566965190741381, "grad_norm": 0.13960140943527222, "learning_rate": 0.001, "loss": 2.1069, "num_input_tokens_seen": 47073403584, "step": 89800 }, { "epoch": 0.8571735215903263, "grad_norm": 0.1304618865251541, "learning_rate": 0.001, "loss": 2.1108, "num_input_tokens_seen": 47099616384, "step": 89850 }, { "epoch": 0.8576505241065147, "grad_norm": 0.13719524443149567, "learning_rate": 0.001, "loss": 2.1156, "num_input_tokens_seen": 47125823008, "step": 89900 }, { "epoch": 0.858127526622703, "grad_norm": 0.1370345801115036, "learning_rate": 0.001, "loss": 2.1146, "num_input_tokens_seen": 47152034368, "step": 89950 }, { "epoch": 0.8586045291388912, "grad_norm": 0.13640981912612915, "learning_rate": 0.001, "loss": 2.1208, "num_input_tokens_seen": 47178248768, "step": 90000 }, { "epoch": 0.8586045291388912, "eval_loss": 2.0290136337280273, "eval_runtime": 79.9812, "eval_samples_per_second": 62.515, "eval_steps_per_second": 15.629, "num_input_tokens_seen": 47178248768, "step": 90000 }, { "epoch": 0.8590815316550795, "grad_norm": 0.1481214165687561, "learning_rate": 0.001, "loss": 2.1111, "num_input_tokens_seen": 47204461440, "step": 90050 }, { "epoch": 0.8595585341712677, "grad_norm": 0.1385306715965271, "learning_rate": 0.001, "loss": 2.1111, "num_input_tokens_seen": 47230673824, "step": 90100 }, { "epoch": 0.860035536687456, "grad_norm": 0.15070556104183197, "learning_rate": 0.001, "loss": 2.1117, "num_input_tokens_seen": 47256883968, "step": 90150 }, { "epoch": 0.8605125392036443, "grad_norm": 0.14528563618659973, "learning_rate": 0.001, "loss": 2.1095, "num_input_tokens_seen": 47283091808, "step": 90200 }, { "epoch": 0.8609895417198326, "grad_norm": 0.13799038529396057, "learning_rate": 0.001, "loss": 2.108, "num_input_tokens_seen": 47309298976, "step": 90250 }, { "epoch": 0.8614665442360209, "grad_norm": 0.14631977677345276, "learning_rate": 0.001, "loss": 2.1033, "num_input_tokens_seen": 47335513376, "step": 90300 }, { "epoch": 0.8619435467522091, "grad_norm": 0.14224396646022797, "learning_rate": 0.001, "loss": 2.1157, "num_input_tokens_seen": 47361721504, "step": 90350 }, { "epoch": 0.8624205492683974, "grad_norm": 0.14005549252033234, "learning_rate": 0.001, "loss": 2.1103, "num_input_tokens_seen": 47387932000, "step": 90400 }, { "epoch": 0.8628975517845857, "grad_norm": 0.13566839694976807, "learning_rate": 0.001, "loss": 2.1042, "num_input_tokens_seen": 47414144928, "step": 90450 }, { "epoch": 0.8633745543007739, "grad_norm": 0.14136169850826263, "learning_rate": 0.001, "loss": 2.1056, "num_input_tokens_seen": 47440355904, "step": 90500 }, { "epoch": 0.8633745543007739, "eval_loss": 2.028252124786377, "eval_runtime": 79.9732, "eval_samples_per_second": 62.521, "eval_steps_per_second": 15.63, "num_input_tokens_seen": 47440355904, "step": 90500 }, { "epoch": 0.8638515568169622, "grad_norm": 0.13514867424964905, "learning_rate": 0.001, "loss": 2.1127, "num_input_tokens_seen": 47466570304, "step": 90550 }, { "epoch": 0.8643285593331504, "grad_norm": 0.13685061037540436, "learning_rate": 0.001, "loss": 2.1115, "num_input_tokens_seen": 47492780416, "step": 90600 }, { "epoch": 0.8648055618493388, "grad_norm": 0.13860297203063965, "learning_rate": 0.001, "loss": 2.1125, "num_input_tokens_seen": 47518992000, "step": 90650 }, { "epoch": 0.865282564365527, "grad_norm": 0.13972771167755127, "learning_rate": 0.001, "loss": 2.1231, "num_input_tokens_seen": 47545201600, "step": 90700 }, { "epoch": 0.8657595668817153, "grad_norm": 0.14439938962459564, "learning_rate": 0.001, "loss": 2.1118, "num_input_tokens_seen": 47571409888, "step": 90750 }, { "epoch": 0.8662365693979036, "grad_norm": 0.1624486893415451, "learning_rate": 0.001, "loss": 2.1081, "num_input_tokens_seen": 47597620352, "step": 90800 }, { "epoch": 0.8667135719140918, "grad_norm": 0.1499566286802292, "learning_rate": 0.001, "loss": 2.1115, "num_input_tokens_seen": 47623825344, "step": 90850 }, { "epoch": 0.8671905744302801, "grad_norm": 0.14615119993686676, "learning_rate": 0.001, "loss": 2.1223, "num_input_tokens_seen": 47650039392, "step": 90900 }, { "epoch": 0.8676675769464683, "grad_norm": 0.14285366237163544, "learning_rate": 0.001, "loss": 2.1164, "num_input_tokens_seen": 47676251808, "step": 90950 }, { "epoch": 0.8681445794626567, "grad_norm": 0.13764303922653198, "learning_rate": 0.001, "loss": 2.1071, "num_input_tokens_seen": 47702460032, "step": 91000 }, { "epoch": 0.8681445794626567, "eval_loss": 2.0275421142578125, "eval_runtime": 80.4452, "eval_samples_per_second": 62.154, "eval_steps_per_second": 15.539, "num_input_tokens_seen": 47702460032, "step": 91000 }, { "epoch": 0.868621581978845, "grad_norm": 0.146665558218956, "learning_rate": 0.001, "loss": 2.1203, "num_input_tokens_seen": 47728669632, "step": 91050 }, { "epoch": 0.8690985844950332, "grad_norm": 0.15256023406982422, "learning_rate": 0.001, "loss": 2.1144, "num_input_tokens_seen": 47754867168, "step": 91100 }, { "epoch": 0.8695755870112215, "grad_norm": 0.14853611588478088, "learning_rate": 0.001, "loss": 2.1136, "num_input_tokens_seen": 47781081056, "step": 91150 }, { "epoch": 0.8700525895274097, "grad_norm": 0.16396841406822205, "learning_rate": 0.001, "loss": 2.1116, "num_input_tokens_seen": 47807287712, "step": 91200 }, { "epoch": 0.870529592043598, "grad_norm": 0.15022516250610352, "learning_rate": 0.001, "loss": 2.1237, "num_input_tokens_seen": 47833489472, "step": 91250 }, { "epoch": 0.8710065945597864, "grad_norm": 0.1452953815460205, "learning_rate": 0.001, "loss": 2.1158, "num_input_tokens_seen": 47859697664, "step": 91300 }, { "epoch": 0.8714835970759746, "grad_norm": 0.14615213871002197, "learning_rate": 0.001, "loss": 2.1081, "num_input_tokens_seen": 47885910336, "step": 91350 }, { "epoch": 0.8719605995921629, "grad_norm": 0.14509530365467072, "learning_rate": 0.001, "loss": 2.1182, "num_input_tokens_seen": 47912122848, "step": 91400 }, { "epoch": 0.8724376021083511, "grad_norm": 0.14017629623413086, "learning_rate": 0.001, "loss": 2.1148, "num_input_tokens_seen": 47938323776, "step": 91450 }, { "epoch": 0.8729146046245394, "grad_norm": 0.14566557109355927, "learning_rate": 0.001, "loss": 2.1069, "num_input_tokens_seen": 47964538176, "step": 91500 }, { "epoch": 0.8729146046245394, "eval_loss": 2.027026414871216, "eval_runtime": 79.9343, "eval_samples_per_second": 62.551, "eval_steps_per_second": 15.638, "num_input_tokens_seen": 47964538176, "step": 91500 }, { "epoch": 0.8733916071407277, "grad_norm": 0.1404925286769867, "learning_rate": 0.001, "loss": 2.1115, "num_input_tokens_seen": 47990748160, "step": 91550 }, { "epoch": 0.8738686096569159, "grad_norm": 0.14250437915325165, "learning_rate": 0.001, "loss": 2.1123, "num_input_tokens_seen": 48016960768, "step": 91600 }, { "epoch": 0.8743456121731042, "grad_norm": 0.14528650045394897, "learning_rate": 0.001, "loss": 2.1101, "num_input_tokens_seen": 48043175072, "step": 91650 }, { "epoch": 0.8748226146892925, "grad_norm": 0.13685448467731476, "learning_rate": 0.001, "loss": 2.1151, "num_input_tokens_seen": 48069383456, "step": 91700 }, { "epoch": 0.8752996172054808, "grad_norm": 0.14179499447345734, "learning_rate": 0.001, "loss": 2.1015, "num_input_tokens_seen": 48095593984, "step": 91750 }, { "epoch": 0.875776619721669, "grad_norm": 0.1447928100824356, "learning_rate": 0.001, "loss": 2.1114, "num_input_tokens_seen": 48121802368, "step": 91800 }, { "epoch": 0.8762536222378573, "grad_norm": 0.13791429996490479, "learning_rate": 0.001, "loss": 2.1099, "num_input_tokens_seen": 48148014048, "step": 91850 }, { "epoch": 0.8767306247540456, "grad_norm": 0.16552382707595825, "learning_rate": 0.001, "loss": 2.1114, "num_input_tokens_seen": 48174228448, "step": 91900 }, { "epoch": 0.8772076272702338, "grad_norm": 0.14140479266643524, "learning_rate": 0.001, "loss": 2.1097, "num_input_tokens_seen": 48200439552, "step": 91950 }, { "epoch": 0.8776846297864221, "grad_norm": 0.14821244776248932, "learning_rate": 0.001, "loss": 2.1003, "num_input_tokens_seen": 48226649920, "step": 92000 }, { "epoch": 0.8776846297864221, "eval_loss": 2.0305159091949463, "eval_runtime": 79.559, "eval_samples_per_second": 62.846, "eval_steps_per_second": 15.712, "num_input_tokens_seen": 48226649920, "step": 92000 }, { "epoch": 0.8781616323026104, "grad_norm": 0.15452982485294342, "learning_rate": 0.001, "loss": 2.1059, "num_input_tokens_seen": 48252864320, "step": 92050 }, { "epoch": 0.8786386348187987, "grad_norm": 0.13859480619430542, "learning_rate": 0.001, "loss": 2.1111, "num_input_tokens_seen": 48279076256, "step": 92100 }, { "epoch": 0.879115637334987, "grad_norm": 0.13759450614452362, "learning_rate": 0.001, "loss": 2.1025, "num_input_tokens_seen": 48305290432, "step": 92150 }, { "epoch": 0.8795926398511752, "grad_norm": 0.14123345911502838, "learning_rate": 0.001, "loss": 2.1072, "num_input_tokens_seen": 48331503296, "step": 92200 }, { "epoch": 0.8800696423673635, "grad_norm": 0.14411857724189758, "learning_rate": 0.001, "loss": 2.1119, "num_input_tokens_seen": 48357715264, "step": 92250 }, { "epoch": 0.8805466448835517, "grad_norm": 0.14408078789710999, "learning_rate": 0.001, "loss": 2.1135, "num_input_tokens_seen": 48383927936, "step": 92300 }, { "epoch": 0.88102364739974, "grad_norm": 0.1592986285686493, "learning_rate": 0.001, "loss": 2.1052, "num_input_tokens_seen": 48410142336, "step": 92350 }, { "epoch": 0.8815006499159284, "grad_norm": 0.15246793627738953, "learning_rate": 0.001, "loss": 2.1082, "num_input_tokens_seen": 48436356736, "step": 92400 }, { "epoch": 0.8819776524321166, "grad_norm": 0.14505651593208313, "learning_rate": 0.001, "loss": 2.1158, "num_input_tokens_seen": 48462565600, "step": 92450 }, { "epoch": 0.8824546549483049, "grad_norm": 0.13918310403823853, "learning_rate": 0.001, "loss": 2.102, "num_input_tokens_seen": 48488770560, "step": 92500 }, { "epoch": 0.8824546549483049, "eval_loss": 2.0255165100097656, "eval_runtime": 80.9026, "eval_samples_per_second": 61.803, "eval_steps_per_second": 15.451, "num_input_tokens_seen": 48488770560, "step": 92500 }, { "epoch": 0.8829316574644931, "grad_norm": 0.14701320230960846, "learning_rate": 0.001, "loss": 2.1127, "num_input_tokens_seen": 48514976384, "step": 92550 }, { "epoch": 0.8834086599806814, "grad_norm": 0.14102083444595337, "learning_rate": 0.001, "loss": 2.1091, "num_input_tokens_seen": 48541187392, "step": 92600 }, { "epoch": 0.8838856624968696, "grad_norm": 0.14818648993968964, "learning_rate": 0.001, "loss": 2.1097, "num_input_tokens_seen": 48567390624, "step": 92650 }, { "epoch": 0.8843626650130579, "grad_norm": 0.15168142318725586, "learning_rate": 0.001, "loss": 2.125, "num_input_tokens_seen": 48593598688, "step": 92700 }, { "epoch": 0.8848396675292463, "grad_norm": 0.14235848188400269, "learning_rate": 0.001, "loss": 2.1114, "num_input_tokens_seen": 48619802048, "step": 92750 }, { "epoch": 0.8853166700454345, "grad_norm": 0.15503767132759094, "learning_rate": 0.001, "loss": 2.1129, "num_input_tokens_seen": 48646006848, "step": 92800 }, { "epoch": 0.8857936725616228, "grad_norm": 0.13856704533100128, "learning_rate": 0.001, "loss": 2.1055, "num_input_tokens_seen": 48672218720, "step": 92850 }, { "epoch": 0.886270675077811, "grad_norm": 0.14264176785945892, "learning_rate": 0.001, "loss": 2.1163, "num_input_tokens_seen": 48698417760, "step": 92900 }, { "epoch": 0.8867476775939993, "grad_norm": 0.13919401168823242, "learning_rate": 0.001, "loss": 2.1061, "num_input_tokens_seen": 48724617440, "step": 92950 }, { "epoch": 0.8872246801101876, "grad_norm": 0.1560058891773224, "learning_rate": 0.001, "loss": 2.1141, "num_input_tokens_seen": 48750829152, "step": 93000 }, { "epoch": 0.8872246801101876, "eval_loss": 2.0258500576019287, "eval_runtime": 80.6659, "eval_samples_per_second": 61.984, "eval_steps_per_second": 15.496, "num_input_tokens_seen": 48750829152, "step": 93000 }, { "epoch": 0.8877016826263758, "grad_norm": 0.13532903790473938, "learning_rate": 0.001, "loss": 2.11, "num_input_tokens_seen": 48777041568, "step": 93050 }, { "epoch": 0.8881786851425642, "grad_norm": 0.15079811215400696, "learning_rate": 0.001, "loss": 2.1125, "num_input_tokens_seen": 48803252992, "step": 93100 }, { "epoch": 0.8886556876587524, "grad_norm": 0.14035262167453766, "learning_rate": 0.001, "loss": 2.1126, "num_input_tokens_seen": 48829457344, "step": 93150 }, { "epoch": 0.8891326901749407, "grad_norm": 0.14490865170955658, "learning_rate": 0.001, "loss": 2.1058, "num_input_tokens_seen": 48855661760, "step": 93200 }, { "epoch": 0.889609692691129, "grad_norm": 0.14975398778915405, "learning_rate": 0.001, "loss": 2.1079, "num_input_tokens_seen": 48881871328, "step": 93250 }, { "epoch": 0.8900866952073172, "grad_norm": 0.14744842052459717, "learning_rate": 0.001, "loss": 2.1128, "num_input_tokens_seen": 48908085728, "step": 93300 }, { "epoch": 0.8905636977235055, "grad_norm": 0.14297208189964294, "learning_rate": 0.001, "loss": 2.1188, "num_input_tokens_seen": 48934300128, "step": 93350 }, { "epoch": 0.8910407002396937, "grad_norm": 0.1417332887649536, "learning_rate": 0.001, "loss": 2.1138, "num_input_tokens_seen": 48960513184, "step": 93400 }, { "epoch": 0.891517702755882, "grad_norm": 0.1589946150779724, "learning_rate": 0.001, "loss": 2.1187, "num_input_tokens_seen": 48986726080, "step": 93450 }, { "epoch": 0.8919947052720704, "grad_norm": 0.14446181058883667, "learning_rate": 0.001, "loss": 2.1241, "num_input_tokens_seen": 49012940480, "step": 93500 }, { "epoch": 0.8919947052720704, "eval_loss": 2.0254712104797363, "eval_runtime": 79.7436, "eval_samples_per_second": 62.701, "eval_steps_per_second": 15.675, "num_input_tokens_seen": 49012940480, "step": 93500 }, { "epoch": 0.8924717077882586, "grad_norm": 0.15279428660869598, "learning_rate": 0.001, "loss": 2.1113, "num_input_tokens_seen": 49039153024, "step": 93550 }, { "epoch": 0.8929487103044469, "grad_norm": 0.1449560672044754, "learning_rate": 0.001, "loss": 2.1117, "num_input_tokens_seen": 49065367424, "step": 93600 }, { "epoch": 0.8934257128206351, "grad_norm": 0.13819773495197296, "learning_rate": 0.001, "loss": 2.0989, "num_input_tokens_seen": 49091579424, "step": 93650 }, { "epoch": 0.8939027153368234, "grad_norm": 0.13857555389404297, "learning_rate": 0.001, "loss": 2.1183, "num_input_tokens_seen": 49117788864, "step": 93700 }, { "epoch": 0.8943797178530116, "grad_norm": 0.14007195830345154, "learning_rate": 0.001, "loss": 2.1133, "num_input_tokens_seen": 49143999488, "step": 93750 }, { "epoch": 0.8948567203691999, "grad_norm": 0.13965079188346863, "learning_rate": 0.001, "loss": 2.1097, "num_input_tokens_seen": 49170200000, "step": 93800 }, { "epoch": 0.8953337228853883, "grad_norm": 0.1414870172739029, "learning_rate": 0.001, "loss": 2.1164, "num_input_tokens_seen": 49196410944, "step": 93850 }, { "epoch": 0.8958107254015765, "grad_norm": 0.15663990378379822, "learning_rate": 0.001, "loss": 2.1086, "num_input_tokens_seen": 49222620992, "step": 93900 }, { "epoch": 0.8962877279177648, "grad_norm": 0.15661224722862244, "learning_rate": 0.001, "loss": 2.1091, "num_input_tokens_seen": 49248834304, "step": 93950 }, { "epoch": 0.896764730433953, "grad_norm": 0.1401936262845993, "learning_rate": 0.001, "loss": 2.1154, "num_input_tokens_seen": 49275046560, "step": 94000 }, { "epoch": 0.896764730433953, "eval_loss": 2.022953510284424, "eval_runtime": 80.2932, "eval_samples_per_second": 62.272, "eval_steps_per_second": 15.568, "num_input_tokens_seen": 49275046560, "step": 94000 }, { "epoch": 0.8972417329501413, "grad_norm": 0.1521204262971878, "learning_rate": 0.001, "loss": 2.1044, "num_input_tokens_seen": 49301242560, "step": 94050 }, { "epoch": 0.8977187354663296, "grad_norm": 0.1409813016653061, "learning_rate": 0.001, "loss": 2.1165, "num_input_tokens_seen": 49327452352, "step": 94100 }, { "epoch": 0.8981957379825178, "grad_norm": 0.1482156664133072, "learning_rate": 0.001, "loss": 2.1056, "num_input_tokens_seen": 49353664096, "step": 94150 }, { "epoch": 0.8986727404987062, "grad_norm": 0.15649978816509247, "learning_rate": 0.001, "loss": 2.1015, "num_input_tokens_seen": 49379874368, "step": 94200 }, { "epoch": 0.8991497430148944, "grad_norm": 0.1503802388906479, "learning_rate": 0.001, "loss": 2.1087, "num_input_tokens_seen": 49406084320, "step": 94250 }, { "epoch": 0.8996267455310827, "grad_norm": 0.1439296454191208, "learning_rate": 0.001, "loss": 2.1153, "num_input_tokens_seen": 49432296992, "step": 94300 }, { "epoch": 0.900103748047271, "grad_norm": 0.14262431859970093, "learning_rate": 0.001, "loss": 2.1128, "num_input_tokens_seen": 49458508640, "step": 94350 }, { "epoch": 0.9005807505634592, "grad_norm": 0.14529536664485931, "learning_rate": 0.001, "loss": 2.1072, "num_input_tokens_seen": 49484719072, "step": 94400 }, { "epoch": 0.9010577530796475, "grad_norm": 0.1515061855316162, "learning_rate": 0.001, "loss": 2.1033, "num_input_tokens_seen": 49510932320, "step": 94450 }, { "epoch": 0.9015347555958357, "grad_norm": 0.14318937063217163, "learning_rate": 0.001, "loss": 2.0967, "num_input_tokens_seen": 49537142912, "step": 94500 }, { "epoch": 0.9015347555958357, "eval_loss": 2.023240327835083, "eval_runtime": 80.5534, "eval_samples_per_second": 62.071, "eval_steps_per_second": 15.518, "num_input_tokens_seen": 49537142912, "step": 94500 }, { "epoch": 0.902011758112024, "grad_norm": 0.14540798962116241, "learning_rate": 0.001, "loss": 2.1117, "num_input_tokens_seen": 49563352672, "step": 94550 }, { "epoch": 0.9024887606282123, "grad_norm": 0.15588590502738953, "learning_rate": 0.001, "loss": 2.1148, "num_input_tokens_seen": 49589555168, "step": 94600 }, { "epoch": 0.9029657631444006, "grad_norm": 0.14033040404319763, "learning_rate": 0.001, "loss": 2.1072, "num_input_tokens_seen": 49615768512, "step": 94650 }, { "epoch": 0.9034427656605889, "grad_norm": 0.1453922539949417, "learning_rate": 0.001, "loss": 2.1151, "num_input_tokens_seen": 49641970368, "step": 94700 }, { "epoch": 0.9039197681767771, "grad_norm": 0.14980725944042206, "learning_rate": 0.001, "loss": 2.1093, "num_input_tokens_seen": 49668181024, "step": 94750 }, { "epoch": 0.9043967706929654, "grad_norm": 0.14013737440109253, "learning_rate": 0.001, "loss": 2.1214, "num_input_tokens_seen": 49694392704, "step": 94800 }, { "epoch": 0.9048737732091536, "grad_norm": 0.13809648156166077, "learning_rate": 0.001, "loss": 2.1151, "num_input_tokens_seen": 49720606848, "step": 94850 }, { "epoch": 0.905350775725342, "grad_norm": 0.13267497718334198, "learning_rate": 0.001, "loss": 2.1058, "num_input_tokens_seen": 49746811360, "step": 94900 }, { "epoch": 0.9058277782415303, "grad_norm": 0.1532643884420395, "learning_rate": 0.001, "loss": 2.1101, "num_input_tokens_seen": 49773025760, "step": 94950 }, { "epoch": 0.9063047807577185, "grad_norm": 0.13915950059890747, "learning_rate": 0.001, "loss": 2.1101, "num_input_tokens_seen": 49799231680, "step": 95000 }, { "epoch": 0.9063047807577185, "eval_loss": 2.024012804031372, "eval_runtime": 80.2612, "eval_samples_per_second": 62.297, "eval_steps_per_second": 15.574, "num_input_tokens_seen": 49799231680, "step": 95000 }, { "epoch": 0.9067817832739068, "grad_norm": 0.15056970715522766, "learning_rate": 0.001, "loss": 2.1047, "num_input_tokens_seen": 49825444800, "step": 95050 }, { "epoch": 0.907258785790095, "grad_norm": 0.13517211377620697, "learning_rate": 0.001, "loss": 2.0979, "num_input_tokens_seen": 49851655808, "step": 95100 }, { "epoch": 0.9077357883062833, "grad_norm": 0.13956350088119507, "learning_rate": 0.001, "loss": 2.1086, "num_input_tokens_seen": 49877867392, "step": 95150 }, { "epoch": 0.9082127908224716, "grad_norm": 0.1523425281047821, "learning_rate": 0.001, "loss": 2.0936, "num_input_tokens_seen": 49904076064, "step": 95200 }, { "epoch": 0.9086897933386598, "grad_norm": 0.15285497903823853, "learning_rate": 0.001, "loss": 2.1048, "num_input_tokens_seen": 49930288160, "step": 95250 }, { "epoch": 0.9091667958548482, "grad_norm": 0.14413221180438995, "learning_rate": 0.001, "loss": 2.0946, "num_input_tokens_seen": 49956501184, "step": 95300 }, { "epoch": 0.9096437983710364, "grad_norm": 0.1461506485939026, "learning_rate": 0.001, "loss": 2.1084, "num_input_tokens_seen": 49982711360, "step": 95350 }, { "epoch": 0.9101208008872247, "grad_norm": 0.13794639706611633, "learning_rate": 0.001, "loss": 2.1014, "num_input_tokens_seen": 50008921440, "step": 95400 }, { "epoch": 0.9105978034034129, "grad_norm": 0.14720895886421204, "learning_rate": 0.001, "loss": 2.1017, "num_input_tokens_seen": 50035120864, "step": 95450 }, { "epoch": 0.9110748059196012, "grad_norm": 0.14016789197921753, "learning_rate": 0.001, "loss": 2.1087, "num_input_tokens_seen": 50061331936, "step": 95500 }, { "epoch": 0.9110748059196012, "eval_loss": 2.0230298042297363, "eval_runtime": 79.7075, "eval_samples_per_second": 62.729, "eval_steps_per_second": 15.682, "num_input_tokens_seen": 50061331936, "step": 95500 }, { "epoch": 0.9115518084357895, "grad_norm": 0.15129534900188446, "learning_rate": 0.001, "loss": 2.1033, "num_input_tokens_seen": 50087536032, "step": 95550 }, { "epoch": 0.9120288109519777, "grad_norm": 0.14662089943885803, "learning_rate": 0.001, "loss": 2.1099, "num_input_tokens_seen": 50113747872, "step": 95600 }, { "epoch": 0.9125058134681661, "grad_norm": 0.15536580979824066, "learning_rate": 0.001, "loss": 2.1292, "num_input_tokens_seen": 50139961728, "step": 95650 }, { "epoch": 0.9129828159843543, "grad_norm": 0.14354456961154938, "learning_rate": 0.001, "loss": 2.105, "num_input_tokens_seen": 50166173152, "step": 95700 }, { "epoch": 0.9134598185005426, "grad_norm": 0.15019798278808594, "learning_rate": 0.001, "loss": 2.1131, "num_input_tokens_seen": 50192384928, "step": 95750 }, { "epoch": 0.9139368210167309, "grad_norm": 0.13612700998783112, "learning_rate": 0.001, "loss": 2.1122, "num_input_tokens_seen": 50218597600, "step": 95800 }, { "epoch": 0.9144138235329191, "grad_norm": 0.1439824402332306, "learning_rate": 0.001, "loss": 2.1018, "num_input_tokens_seen": 50244810528, "step": 95850 }, { "epoch": 0.9148908260491074, "grad_norm": 0.15556299686431885, "learning_rate": 0.001, "loss": 2.1064, "num_input_tokens_seen": 50271019008, "step": 95900 }, { "epoch": 0.9153678285652956, "grad_norm": 0.1479777693748474, "learning_rate": 0.001, "loss": 2.1111, "num_input_tokens_seen": 50297228896, "step": 95950 }, { "epoch": 0.915844831081484, "grad_norm": 0.14080928266048431, "learning_rate": 0.001, "loss": 2.1011, "num_input_tokens_seen": 50323443136, "step": 96000 }, { "epoch": 0.915844831081484, "eval_loss": 2.022911310195923, "eval_runtime": 79.6192, "eval_samples_per_second": 62.799, "eval_steps_per_second": 15.7, "num_input_tokens_seen": 50323443136, "step": 96000 }, { "epoch": 0.9163218335976723, "grad_norm": 0.14819885790348053, "learning_rate": 0.001, "loss": 2.1138, "num_input_tokens_seen": 50349653824, "step": 96050 }, { "epoch": 0.9167988361138605, "grad_norm": 0.1338687390089035, "learning_rate": 0.001, "loss": 2.1068, "num_input_tokens_seen": 50375857728, "step": 96100 }, { "epoch": 0.9172758386300488, "grad_norm": 0.14282946288585663, "learning_rate": 0.001, "loss": 2.1155, "num_input_tokens_seen": 50402064096, "step": 96150 }, { "epoch": 0.917752841146237, "grad_norm": 0.137980654835701, "learning_rate": 0.001, "loss": 2.0963, "num_input_tokens_seen": 50428274592, "step": 96200 }, { "epoch": 0.9182298436624253, "grad_norm": 0.1530529260635376, "learning_rate": 0.001, "loss": 2.1074, "num_input_tokens_seen": 50454485120, "step": 96250 }, { "epoch": 0.9187068461786136, "grad_norm": 0.15306447446346283, "learning_rate": 0.001, "loss": 2.1107, "num_input_tokens_seen": 50480694976, "step": 96300 }, { "epoch": 0.9191838486948019, "grad_norm": 0.13567544519901276, "learning_rate": 0.001, "loss": 2.1087, "num_input_tokens_seen": 50506903392, "step": 96350 }, { "epoch": 0.9196608512109902, "grad_norm": 0.14647279679775238, "learning_rate": 0.001, "loss": 2.1087, "num_input_tokens_seen": 50533115264, "step": 96400 }, { "epoch": 0.9201378537271784, "grad_norm": 0.14072103798389435, "learning_rate": 0.001, "loss": 2.1003, "num_input_tokens_seen": 50559324224, "step": 96450 }, { "epoch": 0.9206148562433667, "grad_norm": 0.1334242820739746, "learning_rate": 0.001, "loss": 2.1052, "num_input_tokens_seen": 50585537568, "step": 96500 }, { "epoch": 0.9206148562433667, "eval_loss": 2.0225579738616943, "eval_runtime": 79.9486, "eval_samples_per_second": 62.54, "eval_steps_per_second": 15.635, "num_input_tokens_seen": 50585537568, "step": 96500 }, { "epoch": 0.9210918587595549, "grad_norm": 0.1523120254278183, "learning_rate": 0.001, "loss": 2.098, "num_input_tokens_seen": 50611746976, "step": 96550 }, { "epoch": 0.9215688612757432, "grad_norm": 0.14341165125370026, "learning_rate": 0.001, "loss": 2.1105, "num_input_tokens_seen": 50637952320, "step": 96600 }, { "epoch": 0.9220458637919315, "grad_norm": 0.15297015011310577, "learning_rate": 0.001, "loss": 2.1116, "num_input_tokens_seen": 50664162720, "step": 96650 }, { "epoch": 0.9225228663081197, "grad_norm": 0.15151242911815643, "learning_rate": 0.001, "loss": 2.1109, "num_input_tokens_seen": 50690366624, "step": 96700 }, { "epoch": 0.9229998688243081, "grad_norm": 0.14462804794311523, "learning_rate": 0.001, "loss": 2.117, "num_input_tokens_seen": 50716579904, "step": 96750 }, { "epoch": 0.9234768713404963, "grad_norm": 0.1390417069196701, "learning_rate": 0.001, "loss": 2.1016, "num_input_tokens_seen": 50742794304, "step": 96800 }, { "epoch": 0.9239538738566846, "grad_norm": 0.14151330292224884, "learning_rate": 0.001, "loss": 2.1032, "num_input_tokens_seen": 50769000928, "step": 96850 }, { "epoch": 0.9244308763728729, "grad_norm": 0.1432236135005951, "learning_rate": 0.001, "loss": 2.1133, "num_input_tokens_seen": 50795209632, "step": 96900 }, { "epoch": 0.9249078788890611, "grad_norm": 0.14917080104351044, "learning_rate": 0.001, "loss": 2.1092, "num_input_tokens_seen": 50821418560, "step": 96950 }, { "epoch": 0.9253848814052494, "grad_norm": 0.14105528593063354, "learning_rate": 0.001, "loss": 2.1024, "num_input_tokens_seen": 50847631776, "step": 97000 }, { "epoch": 0.9253848814052494, "eval_loss": 2.0225133895874023, "eval_runtime": 79.6596, "eval_samples_per_second": 62.767, "eval_steps_per_second": 15.692, "num_input_tokens_seen": 50847631776, "step": 97000 }, { "epoch": 0.9258618839214376, "grad_norm": 0.1577770859003067, "learning_rate": 0.001, "loss": 2.1079, "num_input_tokens_seen": 50873841088, "step": 97050 }, { "epoch": 0.926338886437626, "grad_norm": 0.13983358442783356, "learning_rate": 0.001, "loss": 2.1037, "num_input_tokens_seen": 50900050112, "step": 97100 }, { "epoch": 0.9268158889538143, "grad_norm": 0.14196738600730896, "learning_rate": 0.001, "loss": 2.109, "num_input_tokens_seen": 50926261280, "step": 97150 }, { "epoch": 0.9272928914700025, "grad_norm": 0.1525181531906128, "learning_rate": 0.001, "loss": 2.1104, "num_input_tokens_seen": 50952473344, "step": 97200 }, { "epoch": 0.9277698939861908, "grad_norm": 0.14153936505317688, "learning_rate": 0.001, "loss": 2.1188, "num_input_tokens_seen": 50978687744, "step": 97250 }, { "epoch": 0.928246896502379, "grad_norm": 0.13389533758163452, "learning_rate": 0.001, "loss": 2.1066, "num_input_tokens_seen": 51004893696, "step": 97300 }, { "epoch": 0.9287238990185673, "grad_norm": 0.1618724912405014, "learning_rate": 0.001, "loss": 2.109, "num_input_tokens_seen": 51031102752, "step": 97350 }, { "epoch": 0.9292009015347555, "grad_norm": 0.146076038479805, "learning_rate": 0.001, "loss": 2.1104, "num_input_tokens_seen": 51057314816, "step": 97400 }, { "epoch": 0.9296779040509439, "grad_norm": 0.14311222732067108, "learning_rate": 0.001, "loss": 2.1074, "num_input_tokens_seen": 51083520832, "step": 97450 }, { "epoch": 0.9301549065671322, "grad_norm": 0.1500881314277649, "learning_rate": 0.001, "loss": 2.1032, "num_input_tokens_seen": 51109732512, "step": 97500 }, { "epoch": 0.9301549065671322, "eval_loss": 2.022352695465088, "eval_runtime": 80.43, "eval_samples_per_second": 62.166, "eval_steps_per_second": 15.541, "num_input_tokens_seen": 51109732512, "step": 97500 }, { "epoch": 0.9306319090833204, "grad_norm": 0.15114109218120575, "learning_rate": 0.001, "loss": 2.109, "num_input_tokens_seen": 51135938816, "step": 97550 }, { "epoch": 0.9311089115995087, "grad_norm": 0.1397593915462494, "learning_rate": 0.001, "loss": 2.1089, "num_input_tokens_seen": 51162148352, "step": 97600 }, { "epoch": 0.9315859141156969, "grad_norm": 0.13163454830646515, "learning_rate": 0.001, "loss": 2.0974, "num_input_tokens_seen": 51188357216, "step": 97650 }, { "epoch": 0.9320629166318852, "grad_norm": 0.1726287603378296, "learning_rate": 0.001, "loss": 2.1258, "num_input_tokens_seen": 51214571616, "step": 97700 }, { "epoch": 0.9325399191480735, "grad_norm": 0.14246715605258942, "learning_rate": 0.001, "loss": 2.0959, "num_input_tokens_seen": 51240780352, "step": 97750 }, { "epoch": 0.9330169216642618, "grad_norm": 0.13136450946331024, "learning_rate": 0.001, "loss": 2.1083, "num_input_tokens_seen": 51266988320, "step": 97800 }, { "epoch": 0.9334939241804501, "grad_norm": 0.15461480617523193, "learning_rate": 0.001, "loss": 2.106, "num_input_tokens_seen": 51293202720, "step": 97850 }, { "epoch": 0.9339709266966383, "grad_norm": 0.14553368091583252, "learning_rate": 0.001, "loss": 2.1029, "num_input_tokens_seen": 51319415616, "step": 97900 }, { "epoch": 0.9344479292128266, "grad_norm": 0.14998067915439606, "learning_rate": 0.001, "loss": 2.1174, "num_input_tokens_seen": 51345619808, "step": 97950 }, { "epoch": 0.9349249317290149, "grad_norm": 0.14988018572330475, "learning_rate": 0.001, "loss": 2.0984, "num_input_tokens_seen": 51371830432, "step": 98000 }, { "epoch": 0.9349249317290149, "eval_loss": 2.02105712890625, "eval_runtime": 80.5093, "eval_samples_per_second": 62.105, "eval_steps_per_second": 15.526, "num_input_tokens_seen": 51371830432, "step": 98000 }, { "epoch": 0.9354019342452031, "grad_norm": 0.1521526575088501, "learning_rate": 0.001, "loss": 2.0988, "num_input_tokens_seen": 51398036448, "step": 98050 }, { "epoch": 0.9358789367613914, "grad_norm": 0.13992975652217865, "learning_rate": 0.001, "loss": 2.1074, "num_input_tokens_seen": 51424242880, "step": 98100 }, { "epoch": 0.9363559392775797, "grad_norm": 0.1415923684835434, "learning_rate": 0.001, "loss": 2.0952, "num_input_tokens_seen": 51450457280, "step": 98150 }, { "epoch": 0.936832941793768, "grad_norm": 0.15722358226776123, "learning_rate": 0.001, "loss": 2.1148, "num_input_tokens_seen": 51476663840, "step": 98200 }, { "epoch": 0.9373099443099563, "grad_norm": 0.14942607283592224, "learning_rate": 0.001, "loss": 2.1009, "num_input_tokens_seen": 51502876256, "step": 98250 }, { "epoch": 0.9377869468261445, "grad_norm": 0.1397363543510437, "learning_rate": 0.001, "loss": 2.1083, "num_input_tokens_seen": 51529086944, "step": 98300 }, { "epoch": 0.9382639493423328, "grad_norm": 0.14004074037075043, "learning_rate": 0.001, "loss": 2.1009, "num_input_tokens_seen": 51555301344, "step": 98350 }, { "epoch": 0.938740951858521, "grad_norm": 0.15313847362995148, "learning_rate": 0.001, "loss": 2.1016, "num_input_tokens_seen": 51581506560, "step": 98400 }, { "epoch": 0.9392179543747093, "grad_norm": 0.1391165405511856, "learning_rate": 0.001, "loss": 2.1054, "num_input_tokens_seen": 51607720960, "step": 98450 }, { "epoch": 0.9396949568908975, "grad_norm": 0.15387007594108582, "learning_rate": 0.001, "loss": 2.1133, "num_input_tokens_seen": 51633931744, "step": 98500 }, { "epoch": 0.9396949568908975, "eval_loss": 2.0206305980682373, "eval_runtime": 79.5875, "eval_samples_per_second": 62.824, "eval_steps_per_second": 15.706, "num_input_tokens_seen": 51633931744, "step": 98500 }, { "epoch": 0.9401719594070859, "grad_norm": 0.15347087383270264, "learning_rate": 0.001, "loss": 2.1066, "num_input_tokens_seen": 51660140800, "step": 98550 }, { "epoch": 0.9406489619232742, "grad_norm": 0.14395572245121002, "learning_rate": 0.001, "loss": 2.104, "num_input_tokens_seen": 51686349760, "step": 98600 }, { "epoch": 0.9411259644394624, "grad_norm": 0.1397567093372345, "learning_rate": 0.001, "loss": 2.1054, "num_input_tokens_seen": 51712553888, "step": 98650 }, { "epoch": 0.9416029669556507, "grad_norm": 0.14795741438865662, "learning_rate": 0.001, "loss": 2.0911, "num_input_tokens_seen": 51738759968, "step": 98700 }, { "epoch": 0.9420799694718389, "grad_norm": 0.15419213473796844, "learning_rate": 0.001, "loss": 2.1068, "num_input_tokens_seen": 51764972992, "step": 98750 }, { "epoch": 0.9425569719880272, "grad_norm": 0.14047859609127045, "learning_rate": 0.001, "loss": 2.1007, "num_input_tokens_seen": 51791183968, "step": 98800 }, { "epoch": 0.9430339745042156, "grad_norm": 0.15431874990463257, "learning_rate": 0.001, "loss": 2.1006, "num_input_tokens_seen": 51817396704, "step": 98850 }, { "epoch": 0.9435109770204038, "grad_norm": 0.14634360373020172, "learning_rate": 0.001, "loss": 2.113, "num_input_tokens_seen": 51843609440, "step": 98900 }, { "epoch": 0.9439879795365921, "grad_norm": 0.13855423033237457, "learning_rate": 0.001, "loss": 2.0972, "num_input_tokens_seen": 51869820832, "step": 98950 }, { "epoch": 0.9444649820527803, "grad_norm": 0.14774784445762634, "learning_rate": 0.001, "loss": 2.1038, "num_input_tokens_seen": 51896030848, "step": 99000 }, { "epoch": 0.9444649820527803, "eval_loss": 2.020467519760132, "eval_runtime": 80.0716, "eval_samples_per_second": 62.444, "eval_steps_per_second": 15.611, "num_input_tokens_seen": 51896030848, "step": 99000 }, { "epoch": 0.9449419845689686, "grad_norm": 0.12909556925296783, "learning_rate": 0.001, "loss": 2.1117, "num_input_tokens_seen": 51922241632, "step": 99050 }, { "epoch": 0.9454189870851569, "grad_norm": 0.14898623526096344, "learning_rate": 0.001, "loss": 2.098, "num_input_tokens_seen": 51948454816, "step": 99100 }, { "epoch": 0.9458959896013451, "grad_norm": 0.1472884863615036, "learning_rate": 0.001, "loss": 2.1143, "num_input_tokens_seen": 51974667424, "step": 99150 }, { "epoch": 0.9463729921175335, "grad_norm": 0.15299226343631744, "learning_rate": 0.001, "loss": 2.1064, "num_input_tokens_seen": 52000872800, "step": 99200 }, { "epoch": 0.9468499946337217, "grad_norm": 0.15084148943424225, "learning_rate": 0.001, "loss": 2.108, "num_input_tokens_seen": 52027087200, "step": 99250 }, { "epoch": 0.94732699714991, "grad_norm": 0.14663194119930267, "learning_rate": 0.001, "loss": 2.1039, "num_input_tokens_seen": 52053301600, "step": 99300 }, { "epoch": 0.9478039996660982, "grad_norm": 0.1480923891067505, "learning_rate": 0.001, "loss": 2.103, "num_input_tokens_seen": 52079512832, "step": 99350 }, { "epoch": 0.9482810021822865, "grad_norm": 0.14708015322685242, "learning_rate": 0.001, "loss": 2.1194, "num_input_tokens_seen": 52105726016, "step": 99400 }, { "epoch": 0.9487580046984748, "grad_norm": 0.1524500995874405, "learning_rate": 0.001, "loss": 2.1092, "num_input_tokens_seen": 52131935744, "step": 99450 }, { "epoch": 0.949235007214663, "grad_norm": 0.15018624067306519, "learning_rate": 0.001, "loss": 2.1021, "num_input_tokens_seen": 52158147168, "step": 99500 }, { "epoch": 0.949235007214663, "eval_loss": 2.022200345993042, "eval_runtime": 79.553, "eval_samples_per_second": 62.851, "eval_steps_per_second": 15.713, "num_input_tokens_seen": 52158147168, "step": 99500 }, { "epoch": 0.9497120097308513, "grad_norm": 0.13732361793518066, "learning_rate": 0.001, "loss": 2.1062, "num_input_tokens_seen": 52184356352, "step": 99550 }, { "epoch": 0.9501890122470396, "grad_norm": 0.14235976338386536, "learning_rate": 0.001, "loss": 2.1168, "num_input_tokens_seen": 52210570752, "step": 99600 }, { "epoch": 0.9506660147632279, "grad_norm": 0.1550482213497162, "learning_rate": 0.001, "loss": 2.1053, "num_input_tokens_seen": 52236785152, "step": 99650 }, { "epoch": 0.9511430172794162, "grad_norm": 0.1557578146457672, "learning_rate": 0.001, "loss": 2.1026, "num_input_tokens_seen": 52262997760, "step": 99700 }, { "epoch": 0.9516200197956044, "grad_norm": 0.1451166272163391, "learning_rate": 0.001, "loss": 2.0997, "num_input_tokens_seen": 52289205056, "step": 99750 }, { "epoch": 0.9520970223117927, "grad_norm": 0.15717899799346924, "learning_rate": 0.001, "loss": 2.1088, "num_input_tokens_seen": 52315417376, "step": 99800 }, { "epoch": 0.9525740248279809, "grad_norm": 0.14595787227153778, "learning_rate": 0.001, "loss": 2.1015, "num_input_tokens_seen": 52341619488, "step": 99850 }, { "epoch": 0.9530510273441692, "grad_norm": 0.1477060317993164, "learning_rate": 0.001, "loss": 2.0991, "num_input_tokens_seen": 52367830048, "step": 99900 }, { "epoch": 0.9535280298603576, "grad_norm": 0.15001972019672394, "learning_rate": 0.001, "loss": 2.1039, "num_input_tokens_seen": 52394036064, "step": 99950 }, { "epoch": 0.9540050323765458, "grad_norm": 0.13796518743038177, "learning_rate": 0.001, "loss": 2.1116, "num_input_tokens_seen": 52420247968, "step": 100000 }, { "epoch": 0.9540050323765458, "eval_loss": 2.0211336612701416, "eval_runtime": 80.1109, "eval_samples_per_second": 62.413, "eval_steps_per_second": 15.603, "num_input_tokens_seen": 52420247968, "step": 100000 }, { "epoch": 0.9544820348927341, "grad_norm": 0.13650113344192505, "learning_rate": 0.001, "loss": 2.1018, "num_input_tokens_seen": 52446460160, "step": 100050 }, { "epoch": 0.9549590374089223, "grad_norm": 0.14391744136810303, "learning_rate": 0.001, "loss": 2.1004, "num_input_tokens_seen": 52472671616, "step": 100100 }, { "epoch": 0.9554360399251106, "grad_norm": 0.14373421669006348, "learning_rate": 0.001, "loss": 2.1157, "num_input_tokens_seen": 52498882464, "step": 100150 }, { "epoch": 0.9559130424412989, "grad_norm": 0.14512939751148224, "learning_rate": 0.001, "loss": 2.0999, "num_input_tokens_seen": 52525095328, "step": 100200 }, { "epoch": 0.9563900449574871, "grad_norm": 0.14848357439041138, "learning_rate": 0.001, "loss": 2.1042, "num_input_tokens_seen": 52551306848, "step": 100250 }, { "epoch": 0.9568670474736755, "grad_norm": 0.14673751592636108, "learning_rate": 0.001, "loss": 2.0989, "num_input_tokens_seen": 52577512416, "step": 100300 }, { "epoch": 0.9573440499898637, "grad_norm": 0.13780055940151215, "learning_rate": 0.001, "loss": 2.1075, "num_input_tokens_seen": 52603721312, "step": 100350 }, { "epoch": 0.957821052506052, "grad_norm": 0.33724644780158997, "learning_rate": 0.001, "loss": 2.0947, "num_input_tokens_seen": 52629927296, "step": 100400 }, { "epoch": 0.9582980550222402, "grad_norm": 0.14704908430576324, "learning_rate": 0.001, "loss": 2.102, "num_input_tokens_seen": 52656138304, "step": 100450 }, { "epoch": 0.9587750575384285, "grad_norm": 0.14224405586719513, "learning_rate": 0.001, "loss": 2.1026, "num_input_tokens_seen": 52682343360, "step": 100500 }, { "epoch": 0.9587750575384285, "eval_loss": 2.0196518898010254, "eval_runtime": 79.8344, "eval_samples_per_second": 62.63, "eval_steps_per_second": 15.657, "num_input_tokens_seen": 52682343360, "step": 100500 }, { "epoch": 0.9592520600546168, "grad_norm": 0.14282722771167755, "learning_rate": 0.001, "loss": 2.0973, "num_input_tokens_seen": 52708556800, "step": 100550 }, { "epoch": 0.959729062570805, "grad_norm": 0.1467045545578003, "learning_rate": 0.001, "loss": 2.1036, "num_input_tokens_seen": 52734764768, "step": 100600 }, { "epoch": 0.9602060650869934, "grad_norm": 0.16190816462039948, "learning_rate": 0.001, "loss": 2.1033, "num_input_tokens_seen": 52760968160, "step": 100650 }, { "epoch": 0.9606830676031816, "grad_norm": 0.1406693458557129, "learning_rate": 0.001, "loss": 2.105, "num_input_tokens_seen": 52787178912, "step": 100700 }, { "epoch": 0.9611600701193699, "grad_norm": 0.15562649071216583, "learning_rate": 0.001, "loss": 2.1021, "num_input_tokens_seen": 52813388832, "step": 100750 }, { "epoch": 0.9616370726355582, "grad_norm": 0.15426361560821533, "learning_rate": 0.001, "loss": 2.0955, "num_input_tokens_seen": 52839602848, "step": 100800 }, { "epoch": 0.9621140751517464, "grad_norm": 0.14050264656543732, "learning_rate": 0.001, "loss": 2.1041, "num_input_tokens_seen": 52865814016, "step": 100850 }, { "epoch": 0.9625910776679347, "grad_norm": 0.1460646390914917, "learning_rate": 0.001, "loss": 2.1052, "num_input_tokens_seen": 52892026784, "step": 100900 }, { "epoch": 0.9630680801841229, "grad_norm": 0.14038728177547455, "learning_rate": 0.001, "loss": 2.1044, "num_input_tokens_seen": 52918238624, "step": 100950 }, { "epoch": 0.9635450827003113, "grad_norm": 0.15031979978084564, "learning_rate": 0.001, "loss": 2.1031, "num_input_tokens_seen": 52944447232, "step": 101000 }, { "epoch": 0.9635450827003113, "eval_loss": 2.020580530166626, "eval_runtime": 79.7901, "eval_samples_per_second": 62.664, "eval_steps_per_second": 15.666, "num_input_tokens_seen": 52944447232, "step": 101000 }, { "epoch": 0.9640220852164996, "grad_norm": 0.1539318561553955, "learning_rate": 0.001, "loss": 2.0988, "num_input_tokens_seen": 52970660704, "step": 101050 }, { "epoch": 0.9644990877326878, "grad_norm": 0.16137336194515228, "learning_rate": 0.001, "loss": 2.1055, "num_input_tokens_seen": 52996875104, "step": 101100 }, { "epoch": 0.9649760902488761, "grad_norm": 0.14159482717514038, "learning_rate": 0.001, "loss": 2.1, "num_input_tokens_seen": 53023078848, "step": 101150 }, { "epoch": 0.9654530927650643, "grad_norm": 0.1358761340379715, "learning_rate": 0.001, "loss": 2.0979, "num_input_tokens_seen": 53049291552, "step": 101200 }, { "epoch": 0.9659300952812526, "grad_norm": 0.14332178235054016, "learning_rate": 0.001, "loss": 2.1029, "num_input_tokens_seen": 53075499328, "step": 101250 }, { "epoch": 0.9664070977974408, "grad_norm": 0.14132554829120636, "learning_rate": 0.001, "loss": 2.1036, "num_input_tokens_seen": 53101712448, "step": 101300 }, { "epoch": 0.9668841003136291, "grad_norm": 0.15662212669849396, "learning_rate": 0.001, "loss": 2.1036, "num_input_tokens_seen": 53127918176, "step": 101350 }, { "epoch": 0.9673611028298175, "grad_norm": 0.14150604605674744, "learning_rate": 0.001, "loss": 2.1108, "num_input_tokens_seen": 53154132192, "step": 101400 }, { "epoch": 0.9678381053460057, "grad_norm": 0.14353616535663605, "learning_rate": 0.001, "loss": 2.1046, "num_input_tokens_seen": 53180344448, "step": 101450 }, { "epoch": 0.968315107862194, "grad_norm": 0.13792720437049866, "learning_rate": 0.001, "loss": 2.1127, "num_input_tokens_seen": 53206553568, "step": 101500 }, { "epoch": 0.968315107862194, "eval_loss": 2.0184688568115234, "eval_runtime": 79.7642, "eval_samples_per_second": 62.685, "eval_steps_per_second": 15.671, "num_input_tokens_seen": 53206553568, "step": 101500 }, { "epoch": 0.9687921103783822, "grad_norm": 0.13691678643226624, "learning_rate": 0.001, "loss": 2.1006, "num_input_tokens_seen": 53232767200, "step": 101550 }, { "epoch": 0.9692691128945705, "grad_norm": 0.1575424075126648, "learning_rate": 0.001, "loss": 2.0954, "num_input_tokens_seen": 53258974944, "step": 101600 }, { "epoch": 0.9697461154107588, "grad_norm": 0.15218724310398102, "learning_rate": 0.001, "loss": 2.0979, "num_input_tokens_seen": 53285187296, "step": 101650 }, { "epoch": 0.970223117926947, "grad_norm": 0.1503322720527649, "learning_rate": 0.001, "loss": 2.1027, "num_input_tokens_seen": 53311394144, "step": 101700 }, { "epoch": 0.9707001204431354, "grad_norm": 0.16736505925655365, "learning_rate": 0.001, "loss": 2.1031, "num_input_tokens_seen": 53337604896, "step": 101750 }, { "epoch": 0.9711771229593236, "grad_norm": 0.14036568999290466, "learning_rate": 0.001, "loss": 2.1171, "num_input_tokens_seen": 53363816960, "step": 101800 }, { "epoch": 0.9716541254755119, "grad_norm": 0.1408475935459137, "learning_rate": 0.001, "loss": 2.0966, "num_input_tokens_seen": 53390028096, "step": 101850 }, { "epoch": 0.9721311279917002, "grad_norm": 0.146541029214859, "learning_rate": 0.001, "loss": 2.1082, "num_input_tokens_seen": 53416241344, "step": 101900 }, { "epoch": 0.9726081305078884, "grad_norm": 0.1453496813774109, "learning_rate": 0.001, "loss": 2.0955, "num_input_tokens_seen": 53442449632, "step": 101950 }, { "epoch": 0.9730851330240767, "grad_norm": 0.14003250002861023, "learning_rate": 0.001, "loss": 2.0992, "num_input_tokens_seen": 53468658336, "step": 102000 }, { "epoch": 0.9730851330240767, "eval_loss": 2.0179874897003174, "eval_runtime": 80.0076, "eval_samples_per_second": 62.494, "eval_steps_per_second": 15.624, "num_input_tokens_seen": 53468658336, "step": 102000 }, { "epoch": 0.9735621355402649, "grad_norm": 0.14326773583889008, "learning_rate": 0.001, "loss": 2.0932, "num_input_tokens_seen": 53494871072, "step": 102050 }, { "epoch": 0.9740391380564533, "grad_norm": 0.13583138585090637, "learning_rate": 0.001, "loss": 2.0911, "num_input_tokens_seen": 53521073504, "step": 102100 }, { "epoch": 0.9745161405726416, "grad_norm": 0.13492164015769958, "learning_rate": 0.001, "loss": 2.1079, "num_input_tokens_seen": 53547285632, "step": 102150 }, { "epoch": 0.9749931430888298, "grad_norm": 0.1361209750175476, "learning_rate": 0.001, "loss": 2.1075, "num_input_tokens_seen": 53573498688, "step": 102200 }, { "epoch": 0.9754701456050181, "grad_norm": 0.15009672939777374, "learning_rate": 0.001, "loss": 2.1096, "num_input_tokens_seen": 53599709344, "step": 102250 }, { "epoch": 0.9759471481212063, "grad_norm": 0.14739353954792023, "learning_rate": 0.001, "loss": 2.1113, "num_input_tokens_seen": 53625921312, "step": 102300 }, { "epoch": 0.9764241506373946, "grad_norm": 0.13327108323574066, "learning_rate": 0.001, "loss": 2.0993, "num_input_tokens_seen": 53652133472, "step": 102350 }, { "epoch": 0.9769011531535828, "grad_norm": 0.15938664972782135, "learning_rate": 0.001, "loss": 2.1044, "num_input_tokens_seen": 53678346208, "step": 102400 }, { "epoch": 0.9773781556697712, "grad_norm": 0.14256815612316132, "learning_rate": 0.001, "loss": 2.0995, "num_input_tokens_seen": 53704558400, "step": 102450 }, { "epoch": 0.9778551581859595, "grad_norm": 3.5962650775909424, "learning_rate": 0.001, "loss": 2.0968, "num_input_tokens_seen": 53730772800, "step": 102500 }, { "epoch": 0.9778551581859595, "eval_loss": 2.0211129188537598, "eval_runtime": 80.3018, "eval_samples_per_second": 62.265, "eval_steps_per_second": 15.566, "num_input_tokens_seen": 53730772800, "step": 102500 }, { "epoch": 0.9783321607021477, "grad_norm": 0.14861001074314117, "learning_rate": 0.001, "loss": 2.1211, "num_input_tokens_seen": 53756986432, "step": 102550 }, { "epoch": 0.978809163218336, "grad_norm": 0.13950598239898682, "learning_rate": 0.001, "loss": 2.1026, "num_input_tokens_seen": 53783199840, "step": 102600 }, { "epoch": 0.9792861657345242, "grad_norm": 0.14435075223445892, "learning_rate": 0.001, "loss": 2.1062, "num_input_tokens_seen": 53809407904, "step": 102650 }, { "epoch": 0.9797631682507125, "grad_norm": 0.1405237913131714, "learning_rate": 0.001, "loss": 2.1138, "num_input_tokens_seen": 53835613280, "step": 102700 }, { "epoch": 0.9802401707669008, "grad_norm": 0.1563555896282196, "learning_rate": 0.001, "loss": 2.104, "num_input_tokens_seen": 53861818848, "step": 102750 }, { "epoch": 0.980717173283089, "grad_norm": 0.15651467442512512, "learning_rate": 0.001, "loss": 2.1134, "num_input_tokens_seen": 53888025952, "step": 102800 }, { "epoch": 0.9811941757992774, "grad_norm": 0.1491318941116333, "learning_rate": 0.001, "loss": 2.1179, "num_input_tokens_seen": 53914237056, "step": 102850 }, { "epoch": 0.9816711783154656, "grad_norm": 0.15486833453178406, "learning_rate": 0.001, "loss": 2.1086, "num_input_tokens_seen": 53940442144, "step": 102900 }, { "epoch": 0.9821481808316539, "grad_norm": 0.14997461438179016, "learning_rate": 0.001, "loss": 2.1029, "num_input_tokens_seen": 53966652960, "step": 102950 }, { "epoch": 0.9826251833478422, "grad_norm": 0.16969485580921173, "learning_rate": 0.001, "loss": 2.092, "num_input_tokens_seen": 53992863264, "step": 103000 }, { "epoch": 0.9826251833478422, "eval_loss": 2.017953634262085, "eval_runtime": 80.0806, "eval_samples_per_second": 62.437, "eval_steps_per_second": 15.609, "num_input_tokens_seen": 53992863264, "step": 103000 }, { "epoch": 0.9831021858640304, "grad_norm": 0.1433195322751999, "learning_rate": 0.001, "loss": 2.1045, "num_input_tokens_seen": 54019076320, "step": 103050 }, { "epoch": 0.9835791883802187, "grad_norm": 0.13524982333183289, "learning_rate": 0.001, "loss": 2.1099, "num_input_tokens_seen": 54045286592, "step": 103100 }, { "epoch": 0.984056190896407, "grad_norm": 0.13636088371276855, "learning_rate": 0.001, "loss": 2.1139, "num_input_tokens_seen": 54071491616, "step": 103150 }, { "epoch": 0.9845331934125953, "grad_norm": 0.14049679040908813, "learning_rate": 0.001, "loss": 2.1031, "num_input_tokens_seen": 54097697184, "step": 103200 }, { "epoch": 0.9850101959287835, "grad_norm": 0.13444900512695312, "learning_rate": 0.001, "loss": 2.0941, "num_input_tokens_seen": 54123909408, "step": 103250 }, { "epoch": 0.9854871984449718, "grad_norm": 0.136467844247818, "learning_rate": 0.001, "loss": 2.1055, "num_input_tokens_seen": 54150121152, "step": 103300 }, { "epoch": 0.9859642009611601, "grad_norm": 0.14821065962314606, "learning_rate": 0.001, "loss": 2.1062, "num_input_tokens_seen": 54176333824, "step": 103350 }, { "epoch": 0.9864412034773483, "grad_norm": 0.15114334225654602, "learning_rate": 0.001, "loss": 2.0973, "num_input_tokens_seen": 54202545376, "step": 103400 }, { "epoch": 0.9869182059935366, "grad_norm": 0.13535051047801971, "learning_rate": 0.001, "loss": 2.1059, "num_input_tokens_seen": 54228754304, "step": 103450 }, { "epoch": 0.9873952085097248, "grad_norm": 0.13229253888130188, "learning_rate": 0.001, "loss": 2.1016, "num_input_tokens_seen": 54254966464, "step": 103500 }, { "epoch": 0.9873952085097248, "eval_loss": 2.0163238048553467, "eval_runtime": 80.7299, "eval_samples_per_second": 61.935, "eval_steps_per_second": 15.484, "num_input_tokens_seen": 54254966464, "step": 103500 }, { "epoch": 0.9878722110259132, "grad_norm": 0.13689298927783966, "learning_rate": 0.001, "loss": 2.0935, "num_input_tokens_seen": 54281180864, "step": 103550 }, { "epoch": 0.9883492135421015, "grad_norm": 0.13803903758525848, "learning_rate": 0.001, "loss": 2.1036, "num_input_tokens_seen": 54307394080, "step": 103600 }, { "epoch": 0.9888262160582897, "grad_norm": 0.1357845515012741, "learning_rate": 0.001, "loss": 2.0962, "num_input_tokens_seen": 54333608480, "step": 103650 }, { "epoch": 0.989303218574478, "grad_norm": 0.14026238024234772, "learning_rate": 0.001, "loss": 2.1001, "num_input_tokens_seen": 54359813856, "step": 103700 }, { "epoch": 0.9897802210906662, "grad_norm": 0.14243866503238678, "learning_rate": 0.001, "loss": 2.1065, "num_input_tokens_seen": 54386025280, "step": 103750 }, { "epoch": 0.9902572236068545, "grad_norm": 0.14214347302913666, "learning_rate": 0.001, "loss": 2.1031, "num_input_tokens_seen": 54412235712, "step": 103800 }, { "epoch": 0.9907342261230428, "grad_norm": 0.13983015716075897, "learning_rate": 0.001, "loss": 2.0936, "num_input_tokens_seen": 54438450112, "step": 103850 }, { "epoch": 0.9912112286392311, "grad_norm": 0.15471301972866058, "learning_rate": 0.001, "loss": 2.0967, "num_input_tokens_seen": 54464659648, "step": 103900 }, { "epoch": 0.9916882311554194, "grad_norm": 0.14359566569328308, "learning_rate": 0.001, "loss": 2.0986, "num_input_tokens_seen": 54490872800, "step": 103950 }, { "epoch": 0.9921652336716076, "grad_norm": 0.149493008852005, "learning_rate": 0.001, "loss": 2.1098, "num_input_tokens_seen": 54517083360, "step": 104000 }, { "epoch": 0.9921652336716076, "eval_loss": 2.0173633098602295, "eval_runtime": 79.9475, "eval_samples_per_second": 62.541, "eval_steps_per_second": 15.635, "num_input_tokens_seen": 54517083360, "step": 104000 }, { "epoch": 0.9926422361877959, "grad_norm": 0.1350771188735962, "learning_rate": 0.001, "loss": 2.0914, "num_input_tokens_seen": 54543296096, "step": 104050 }, { "epoch": 0.9931192387039841, "grad_norm": 0.14034296572208405, "learning_rate": 0.001, "loss": 2.1122, "num_input_tokens_seen": 54569506560, "step": 104100 }, { "epoch": 0.9935962412201724, "grad_norm": 0.14050833880901337, "learning_rate": 0.001, "loss": 2.1046, "num_input_tokens_seen": 54595720960, "step": 104150 }, { "epoch": 0.9940732437363607, "grad_norm": 0.1437423974275589, "learning_rate": 0.001, "loss": 2.0914, "num_input_tokens_seen": 54621935360, "step": 104200 }, { "epoch": 0.994550246252549, "grad_norm": 0.1375901997089386, "learning_rate": 0.001, "loss": 2.1054, "num_input_tokens_seen": 54648141216, "step": 104250 }, { "epoch": 0.9950272487687373, "grad_norm": 0.14355972409248352, "learning_rate": 0.001, "loss": 2.1057, "num_input_tokens_seen": 54674354528, "step": 104300 }, { "epoch": 0.9955042512849255, "grad_norm": 0.1537715196609497, "learning_rate": 0.001, "loss": 2.102, "num_input_tokens_seen": 54700562880, "step": 104350 }, { "epoch": 0.9959812538011138, "grad_norm": 0.14420664310455322, "learning_rate": 0.001, "loss": 2.1207, "num_input_tokens_seen": 54726772416, "step": 104400 }, { "epoch": 0.9964582563173021, "grad_norm": 0.14655081927776337, "learning_rate": 0.001, "loss": 2.0958, "num_input_tokens_seen": 54752977152, "step": 104450 }, { "epoch": 0.9969352588334903, "grad_norm": 0.15202060341835022, "learning_rate": 0.001, "loss": 2.1103, "num_input_tokens_seen": 54779191328, "step": 104500 }, { "epoch": 0.9969352588334903, "eval_loss": 2.0176143646240234, "eval_runtime": 79.7652, "eval_samples_per_second": 62.684, "eval_steps_per_second": 15.671, "num_input_tokens_seen": 54779191328, "step": 104500 }, { "epoch": 0.9974122613496786, "grad_norm": 0.15168489515781403, "learning_rate": 0.001, "loss": 2.0964, "num_input_tokens_seen": 54805401728, "step": 104550 }, { "epoch": 0.9978892638658668, "grad_norm": 0.14086578786373138, "learning_rate": 0.001, "loss": 2.1098, "num_input_tokens_seen": 54831615168, "step": 104600 }, { "epoch": 0.9983662663820552, "grad_norm": 0.14561446011066437, "learning_rate": 0.001, "loss": 2.0925, "num_input_tokens_seen": 54857821280, "step": 104650 }, { "epoch": 0.9988432688982435, "grad_norm": 0.16160067915916443, "learning_rate": 0.001, "loss": 2.1016, "num_input_tokens_seen": 54884026784, "step": 104700 }, { "epoch": 0.9993202714144317, "grad_norm": 0.14295999705791473, "learning_rate": 0.001, "loss": 2.0871, "num_input_tokens_seen": 54910241184, "step": 104750 }, { "epoch": 0.99979727393062, "grad_norm": 0.163029745221138, "learning_rate": 0.001, "loss": 2.1087, "num_input_tokens_seen": 54936452640, "step": 104800 }, { "epoch": 1.0002766614593892, "grad_norm": 0.14676038920879364, "learning_rate": 0.001, "loss": 2.1678, "num_input_tokens_seen": 54962782880, "step": 104850 }, { "epoch": 1.0007536639755774, "grad_norm": 0.14179593324661255, "learning_rate": 0.001, "loss": 2.0905, "num_input_tokens_seen": 54988994528, "step": 104900 }, { "epoch": 1.0012306664917658, "grad_norm": 0.1496460884809494, "learning_rate": 0.001, "loss": 2.0938, "num_input_tokens_seen": 55015208928, "step": 104950 }, { "epoch": 1.001707669007954, "grad_norm": 0.15026605129241943, "learning_rate": 0.001, "loss": 2.0879, "num_input_tokens_seen": 55041423104, "step": 105000 }, { "epoch": 1.001707669007954, "eval_loss": 2.0179128646850586, "eval_runtime": 80.8772, "eval_samples_per_second": 61.822, "eval_steps_per_second": 15.456, "num_input_tokens_seen": 55041423104, "step": 105000 }, { "epoch": 1.0021846715241423, "grad_norm": 0.13333049416542053, "learning_rate": 0.001, "loss": 2.0942, "num_input_tokens_seen": 55067629792, "step": 105050 }, { "epoch": 1.0026616740403305, "grad_norm": 0.15017394721508026, "learning_rate": 0.001, "loss": 2.0944, "num_input_tokens_seen": 55093844192, "step": 105100 }, { "epoch": 1.003138676556519, "grad_norm": 0.14982599020004272, "learning_rate": 0.001, "loss": 2.0949, "num_input_tokens_seen": 55120057696, "step": 105150 }, { "epoch": 1.003615679072707, "grad_norm": 0.13318419456481934, "learning_rate": 0.001, "loss": 2.0903, "num_input_tokens_seen": 55146267488, "step": 105200 }, { "epoch": 1.0040926815888953, "grad_norm": 0.13913436233997345, "learning_rate": 0.001, "loss": 2.0905, "num_input_tokens_seen": 55172480896, "step": 105250 }, { "epoch": 1.0045696841050837, "grad_norm": 0.14818261563777924, "learning_rate": 0.001, "loss": 2.096, "num_input_tokens_seen": 55198691328, "step": 105300 }, { "epoch": 1.005046686621272, "grad_norm": 0.15057435631752014, "learning_rate": 0.001, "loss": 2.0929, "num_input_tokens_seen": 55224898976, "step": 105350 }, { "epoch": 1.0055236891374602, "grad_norm": 0.15632683038711548, "learning_rate": 0.001, "loss": 2.0921, "num_input_tokens_seen": 55251107776, "step": 105400 }, { "epoch": 1.0060006916536486, "grad_norm": 0.14498716592788696, "learning_rate": 0.001, "loss": 2.0977, "num_input_tokens_seen": 55277321088, "step": 105450 }, { "epoch": 1.0064776941698368, "grad_norm": 0.14519184827804565, "learning_rate": 0.001, "loss": 2.0964, "num_input_tokens_seen": 55303527552, "step": 105500 }, { "epoch": 1.0064776941698368, "eval_loss": 2.0157995223999023, "eval_runtime": 80.5085, "eval_samples_per_second": 62.105, "eval_steps_per_second": 15.526, "num_input_tokens_seen": 55303527552, "step": 105500 }, { "epoch": 1.006954696686025, "grad_norm": 0.15698741376399994, "learning_rate": 0.001, "loss": 2.0974, "num_input_tokens_seen": 55329739936, "step": 105550 }, { "epoch": 1.0074316992022132, "grad_norm": 0.1432969570159912, "learning_rate": 0.001, "loss": 2.0991, "num_input_tokens_seen": 55355942944, "step": 105600 }, { "epoch": 1.0079087017184016, "grad_norm": 0.13702726364135742, "learning_rate": 0.001, "loss": 2.0935, "num_input_tokens_seen": 55382157344, "step": 105650 }, { "epoch": 1.0083857042345898, "grad_norm": 0.14623892307281494, "learning_rate": 0.001, "loss": 2.0941, "num_input_tokens_seen": 55408359808, "step": 105700 }, { "epoch": 1.008862706750778, "grad_norm": 0.14641566574573517, "learning_rate": 0.001, "loss": 2.1119, "num_input_tokens_seen": 55434571200, "step": 105750 }, { "epoch": 1.0093397092669665, "grad_norm": 0.14116981625556946, "learning_rate": 0.001, "loss": 2.086, "num_input_tokens_seen": 55460781792, "step": 105800 }, { "epoch": 1.0098167117831547, "grad_norm": 0.14575734734535217, "learning_rate": 0.001, "loss": 2.1121, "num_input_tokens_seen": 55486986944, "step": 105850 }, { "epoch": 1.010293714299343, "grad_norm": 0.14387919008731842, "learning_rate": 0.001, "loss": 2.0949, "num_input_tokens_seen": 55513194784, "step": 105900 }, { "epoch": 1.010770716815531, "grad_norm": 0.14621268212795258, "learning_rate": 0.001, "loss": 2.0832, "num_input_tokens_seen": 55539409184, "step": 105950 }, { "epoch": 1.0112477193317195, "grad_norm": 0.1453128159046173, "learning_rate": 0.001, "loss": 2.0989, "num_input_tokens_seen": 55565623584, "step": 106000 }, { "epoch": 1.0112477193317195, "eval_loss": 2.0156850814819336, "eval_runtime": 80.793, "eval_samples_per_second": 61.887, "eval_steps_per_second": 15.472, "num_input_tokens_seen": 55565623584, "step": 106000 }, { "epoch": 1.0117247218479077, "grad_norm": 0.1500042974948883, "learning_rate": 0.001, "loss": 2.0889, "num_input_tokens_seen": 55591837984, "step": 106050 }, { "epoch": 1.012201724364096, "grad_norm": 0.14505235850811005, "learning_rate": 0.001, "loss": 2.1002, "num_input_tokens_seen": 55618045632, "step": 106100 }, { "epoch": 1.0126787268802844, "grad_norm": 0.141426682472229, "learning_rate": 0.001, "loss": 2.091, "num_input_tokens_seen": 55644253280, "step": 106150 }, { "epoch": 1.0131557293964726, "grad_norm": 0.1420578509569168, "learning_rate": 0.001, "loss": 2.0987, "num_input_tokens_seen": 55670466400, "step": 106200 }, { "epoch": 1.0136327319126608, "grad_norm": 0.14393913745880127, "learning_rate": 0.001, "loss": 2.0841, "num_input_tokens_seen": 55696673792, "step": 106250 }, { "epoch": 1.0141097344288492, "grad_norm": 0.14071914553642273, "learning_rate": 0.001, "loss": 2.0893, "num_input_tokens_seen": 55722877888, "step": 106300 }, { "epoch": 1.0145867369450374, "grad_norm": 0.13708269596099854, "learning_rate": 0.001, "loss": 2.0979, "num_input_tokens_seen": 55749083968, "step": 106350 }, { "epoch": 1.0150637394612256, "grad_norm": 0.20387065410614014, "learning_rate": 0.001, "loss": 2.2896, "num_input_tokens_seen": 55775290528, "step": 106400 }, { "epoch": 1.0155407419774138, "grad_norm": 0.1379322111606598, "learning_rate": 0.001, "loss": 2.1229, "num_input_tokens_seen": 55801504320, "step": 106450 }, { "epoch": 1.0160177444936023, "grad_norm": 0.14372999966144562, "learning_rate": 0.001, "loss": 2.1079, "num_input_tokens_seen": 55827708736, "step": 106500 }, { "epoch": 1.0160177444936023, "eval_loss": 2.018646001815796, "eval_runtime": 80.5516, "eval_samples_per_second": 62.072, "eval_steps_per_second": 15.518, "num_input_tokens_seen": 55827708736, "step": 106500 }, { "epoch": 1.0164947470097905, "grad_norm": 0.13034076988697052, "learning_rate": 0.001, "loss": 2.0805, "num_input_tokens_seen": 55853917024, "step": 106550 }, { "epoch": 1.0169717495259787, "grad_norm": 0.12940867245197296, "learning_rate": 0.001, "loss": 2.0837, "num_input_tokens_seen": 55880125632, "step": 106600 }, { "epoch": 1.0174487520421671, "grad_norm": 0.13367190957069397, "learning_rate": 0.001, "loss": 2.0991, "num_input_tokens_seen": 55906331840, "step": 106650 }, { "epoch": 1.0179257545583553, "grad_norm": 0.13605284690856934, "learning_rate": 0.001, "loss": 2.0973, "num_input_tokens_seen": 55932528160, "step": 106700 }, { "epoch": 1.0184027570745435, "grad_norm": 0.14544115960597992, "learning_rate": 0.001, "loss": 2.111, "num_input_tokens_seen": 55958742560, "step": 106750 }, { "epoch": 1.0188797595907317, "grad_norm": 0.14273081719875336, "learning_rate": 0.001, "loss": 2.0949, "num_input_tokens_seen": 55984956960, "step": 106800 }, { "epoch": 1.0193567621069202, "grad_norm": 0.14157475531101227, "learning_rate": 0.001, "loss": 2.0859, "num_input_tokens_seen": 56011159744, "step": 106850 }, { "epoch": 1.0198337646231084, "grad_norm": 0.13682667911052704, "learning_rate": 0.001, "loss": 2.0968, "num_input_tokens_seen": 56037373184, "step": 106900 }, { "epoch": 1.0203107671392966, "grad_norm": 0.14256510138511658, "learning_rate": 0.001, "loss": 2.0959, "num_input_tokens_seen": 56063582688, "step": 106950 }, { "epoch": 1.020787769655485, "grad_norm": 0.13422270119190216, "learning_rate": 0.001, "loss": 2.1069, "num_input_tokens_seen": 56089794304, "step": 107000 }, { "epoch": 1.020787769655485, "eval_loss": 2.0148062705993652, "eval_runtime": 80.2529, "eval_samples_per_second": 62.303, "eval_steps_per_second": 15.576, "num_input_tokens_seen": 56089794304, "step": 107000 }, { "epoch": 1.0212647721716732, "grad_norm": 0.1368064284324646, "learning_rate": 0.001, "loss": 2.0883, "num_input_tokens_seen": 56116003136, "step": 107050 }, { "epoch": 1.0217417746878614, "grad_norm": 0.13491053879261017, "learning_rate": 0.001, "loss": 2.0913, "num_input_tokens_seen": 56142210144, "step": 107100 }, { "epoch": 1.0222187772040499, "grad_norm": 0.14345191419124603, "learning_rate": 0.001, "loss": 2.1019, "num_input_tokens_seen": 56168416608, "step": 107150 }, { "epoch": 1.022695779720238, "grad_norm": 0.14869827032089233, "learning_rate": 0.001, "loss": 2.0929, "num_input_tokens_seen": 56194631008, "step": 107200 }, { "epoch": 1.0231727822364263, "grad_norm": 0.14153461158275604, "learning_rate": 0.001, "loss": 2.0992, "num_input_tokens_seen": 56220845408, "step": 107250 }, { "epoch": 1.0236497847526145, "grad_norm": 0.1489809900522232, "learning_rate": 0.001, "loss": 2.0875, "num_input_tokens_seen": 56247053216, "step": 107300 }, { "epoch": 1.024126787268803, "grad_norm": 0.13485555350780487, "learning_rate": 0.001, "loss": 2.0983, "num_input_tokens_seen": 56273267616, "step": 107350 }, { "epoch": 1.0246037897849911, "grad_norm": 0.13658951222896576, "learning_rate": 0.001, "loss": 2.0901, "num_input_tokens_seen": 56299482016, "step": 107400 }, { "epoch": 1.0250807923011793, "grad_norm": 0.1356600672006607, "learning_rate": 0.001, "loss": 2.1042, "num_input_tokens_seen": 56325696416, "step": 107450 }, { "epoch": 1.0255577948173678, "grad_norm": 0.1527785360813141, "learning_rate": 0.001, "loss": 2.0987, "num_input_tokens_seen": 56351910752, "step": 107500 }, { "epoch": 1.0255577948173678, "eval_loss": 2.014725685119629, "eval_runtime": 80.8852, "eval_samples_per_second": 61.816, "eval_steps_per_second": 15.454, "num_input_tokens_seen": 56351910752, "step": 107500 }, { "epoch": 1.026034797333556, "grad_norm": 0.1489991992712021, "learning_rate": 0.001, "loss": 2.0878, "num_input_tokens_seen": 56378123968, "step": 107550 }, { "epoch": 1.0265117998497442, "grad_norm": 0.1518663763999939, "learning_rate": 0.001, "loss": 2.1044, "num_input_tokens_seen": 56404334496, "step": 107600 }, { "epoch": 1.0269888023659324, "grad_norm": 0.13364924490451813, "learning_rate": 0.001, "loss": 2.0928, "num_input_tokens_seen": 56430545248, "step": 107650 }, { "epoch": 1.0274658048821208, "grad_norm": 1.7939748764038086, "learning_rate": 0.001, "loss": 2.1125, "num_input_tokens_seen": 56456755488, "step": 107700 }, { "epoch": 1.027942807398309, "grad_norm": 0.14331629872322083, "learning_rate": 0.001, "loss": 2.1346, "num_input_tokens_seen": 56482965472, "step": 107750 }, { "epoch": 1.0284198099144972, "grad_norm": 0.14626429975032806, "learning_rate": 0.001, "loss": 2.1037, "num_input_tokens_seen": 56509177408, "step": 107800 }, { "epoch": 1.0288968124306856, "grad_norm": 0.15549655258655548, "learning_rate": 0.001, "loss": 2.1006, "num_input_tokens_seen": 56535387392, "step": 107850 }, { "epoch": 1.0293738149468739, "grad_norm": 0.13863249123096466, "learning_rate": 0.001, "loss": 2.1056, "num_input_tokens_seen": 56561593632, "step": 107900 }, { "epoch": 1.029850817463062, "grad_norm": 0.1429344117641449, "learning_rate": 0.001, "loss": 2.0983, "num_input_tokens_seen": 56587805408, "step": 107950 }, { "epoch": 1.0303278199792505, "grad_norm": 0.14651237428188324, "learning_rate": 0.001, "loss": 2.095, "num_input_tokens_seen": 56614015392, "step": 108000 }, { "epoch": 1.0303278199792505, "eval_loss": 2.015336036682129, "eval_runtime": 80.5904, "eval_samples_per_second": 62.042, "eval_steps_per_second": 15.511, "num_input_tokens_seen": 56614015392, "step": 108000 }, { "epoch": 1.0308048224954387, "grad_norm": 0.1352015882730484, "learning_rate": 0.001, "loss": 2.0908, "num_input_tokens_seen": 56640217952, "step": 108050 }, { "epoch": 1.031281825011627, "grad_norm": 0.13381287455558777, "learning_rate": 0.001, "loss": 2.0902, "num_input_tokens_seen": 56666428640, "step": 108100 }, { "epoch": 1.0317588275278151, "grad_norm": 0.14579468965530396, "learning_rate": 0.001, "loss": 2.0909, "num_input_tokens_seen": 56692643040, "step": 108150 }, { "epoch": 1.0322358300440035, "grad_norm": 0.14770351350307465, "learning_rate": 0.001, "loss": 2.0971, "num_input_tokens_seen": 56718857440, "step": 108200 }, { "epoch": 1.0327128325601918, "grad_norm": 0.14099901914596558, "learning_rate": 0.001, "loss": 2.0877, "num_input_tokens_seen": 56745071840, "step": 108250 }, { "epoch": 1.03318983507638, "grad_norm": 0.13044485449790955, "learning_rate": 0.001, "loss": 2.0861, "num_input_tokens_seen": 56771285824, "step": 108300 }, { "epoch": 1.0336668375925684, "grad_norm": 0.14185436069965363, "learning_rate": 0.001, "loss": 2.1007, "num_input_tokens_seen": 56797497888, "step": 108350 }, { "epoch": 1.0341438401087566, "grad_norm": 0.1411305069923401, "learning_rate": 0.001, "loss": 2.093, "num_input_tokens_seen": 56823710368, "step": 108400 }, { "epoch": 1.0346208426249448, "grad_norm": 0.13188087940216064, "learning_rate": 0.001, "loss": 2.0855, "num_input_tokens_seen": 56849920512, "step": 108450 }, { "epoch": 1.035097845141133, "grad_norm": 0.13814617693424225, "learning_rate": 0.001, "loss": 2.097, "num_input_tokens_seen": 56876128320, "step": 108500 }, { "epoch": 1.035097845141133, "eval_loss": 2.013669490814209, "eval_runtime": 80.9952, "eval_samples_per_second": 61.732, "eval_steps_per_second": 15.433, "num_input_tokens_seen": 56876128320, "step": 108500 }, { "epoch": 1.0355748476573214, "grad_norm": 0.15740585327148438, "learning_rate": 0.001, "loss": 2.0945, "num_input_tokens_seen": 56902334400, "step": 108550 }, { "epoch": 1.0360518501735096, "grad_norm": 0.1584336757659912, "learning_rate": 0.001, "loss": 2.0828, "num_input_tokens_seen": 56928547616, "step": 108600 }, { "epoch": 1.0365288526896979, "grad_norm": 0.1397266536951065, "learning_rate": 0.001, "loss": 2.097, "num_input_tokens_seen": 56954753664, "step": 108650 }, { "epoch": 1.0370058552058863, "grad_norm": 0.14098243415355682, "learning_rate": 0.001, "loss": 2.084, "num_input_tokens_seen": 56980968064, "step": 108700 }, { "epoch": 1.0374828577220745, "grad_norm": 0.14463113248348236, "learning_rate": 0.001, "loss": 2.0938, "num_input_tokens_seen": 57007174656, "step": 108750 }, { "epoch": 1.0379598602382627, "grad_norm": 0.15365611016750336, "learning_rate": 0.001, "loss": 2.0972, "num_input_tokens_seen": 57033388672, "step": 108800 }, { "epoch": 1.0384368627544511, "grad_norm": 0.13319768011569977, "learning_rate": 0.001, "loss": 2.095, "num_input_tokens_seen": 57059598624, "step": 108850 }, { "epoch": 1.0389138652706393, "grad_norm": 0.15738600492477417, "learning_rate": 0.001, "loss": 2.1002, "num_input_tokens_seen": 57085810016, "step": 108900 }, { "epoch": 1.0393908677868275, "grad_norm": 0.14639179408550262, "learning_rate": 0.001, "loss": 2.0989, "num_input_tokens_seen": 57112024416, "step": 108950 }, { "epoch": 1.0398678703030158, "grad_norm": 0.1358439177274704, "learning_rate": 0.001, "loss": 2.099, "num_input_tokens_seen": 57138237248, "step": 109000 }, { "epoch": 1.0398678703030158, "eval_loss": 2.0128679275512695, "eval_runtime": 82.3317, "eval_samples_per_second": 60.73, "eval_steps_per_second": 15.182, "num_input_tokens_seen": 57138237248, "step": 109000 }, { "epoch": 1.0403448728192042, "grad_norm": 0.14199453592300415, "learning_rate": 0.001, "loss": 2.1023, "num_input_tokens_seen": 57164448992, "step": 109050 }, { "epoch": 1.0408218753353924, "grad_norm": 0.1462697833776474, "learning_rate": 0.001, "loss": 2.0933, "num_input_tokens_seen": 57190658656, "step": 109100 }, { "epoch": 1.0412988778515806, "grad_norm": 0.14854200184345245, "learning_rate": 0.001, "loss": 2.1052, "num_input_tokens_seen": 57216873056, "step": 109150 }, { "epoch": 1.041775880367769, "grad_norm": 0.140263170003891, "learning_rate": 0.001, "loss": 2.0874, "num_input_tokens_seen": 57243086400, "step": 109200 }, { "epoch": 1.0422528828839572, "grad_norm": 0.1429862082004547, "learning_rate": 0.001, "loss": 2.0981, "num_input_tokens_seen": 57269288928, "step": 109250 }, { "epoch": 1.0427298854001454, "grad_norm": 0.1370985209941864, "learning_rate": 0.001, "loss": 2.0779, "num_input_tokens_seen": 57295496128, "step": 109300 }, { "epoch": 1.0432068879163339, "grad_norm": 0.15176068246364594, "learning_rate": 0.001, "loss": 2.0953, "num_input_tokens_seen": 57321705568, "step": 109350 }, { "epoch": 1.043683890432522, "grad_norm": 0.13600246608257294, "learning_rate": 0.001, "loss": 2.1109, "num_input_tokens_seen": 57347914880, "step": 109400 }, { "epoch": 1.0441608929487103, "grad_norm": 0.15201528370380402, "learning_rate": 0.001, "loss": 2.0852, "num_input_tokens_seen": 57374122464, "step": 109450 }, { "epoch": 1.0446378954648985, "grad_norm": 0.13787305355072021, "learning_rate": 0.001, "loss": 2.0952, "num_input_tokens_seen": 57400333280, "step": 109500 }, { "epoch": 1.0446378954648985, "eval_loss": 2.012360095977783, "eval_runtime": 83.0266, "eval_samples_per_second": 60.222, "eval_steps_per_second": 15.055, "num_input_tokens_seen": 57400333280, "step": 109500 }, { "epoch": 1.045114897981087, "grad_norm": 0.13901057839393616, "learning_rate": 0.001, "loss": 2.0954, "num_input_tokens_seen": 57426541536, "step": 109550 }, { "epoch": 1.0455919004972751, "grad_norm": 0.13901159167289734, "learning_rate": 0.001, "loss": 2.1016, "num_input_tokens_seen": 57452749952, "step": 109600 }, { "epoch": 1.0460689030134633, "grad_norm": 0.14035074412822723, "learning_rate": 0.001, "loss": 2.1006, "num_input_tokens_seen": 57478964352, "step": 109650 }, { "epoch": 1.0465459055296518, "grad_norm": 0.13569940626621246, "learning_rate": 0.001, "loss": 2.0895, "num_input_tokens_seen": 57505178752, "step": 109700 }, { "epoch": 1.04702290804584, "grad_norm": 0.15281043946743011, "learning_rate": 0.001, "loss": 2.0948, "num_input_tokens_seen": 57531385376, "step": 109750 }, { "epoch": 1.0474999105620282, "grad_norm": 0.154220312833786, "learning_rate": 0.001, "loss": 2.0957, "num_input_tokens_seen": 57557599776, "step": 109800 }, { "epoch": 1.0479769130782164, "grad_norm": 0.1422448307275772, "learning_rate": 0.001, "loss": 2.0968, "num_input_tokens_seen": 57583807360, "step": 109850 }, { "epoch": 1.0484539155944048, "grad_norm": 0.1550014317035675, "learning_rate": 0.001, "loss": 2.0924, "num_input_tokens_seen": 57610015680, "step": 109900 }, { "epoch": 1.048930918110593, "grad_norm": 0.1353992372751236, "learning_rate": 0.001, "loss": 2.098, "num_input_tokens_seen": 57636230080, "step": 109950 }, { "epoch": 1.0494079206267812, "grad_norm": 0.15528377890586853, "learning_rate": 0.001, "loss": 2.0895, "num_input_tokens_seen": 57662439520, "step": 110000 }, { "epoch": 1.0494079206267812, "eval_loss": 2.013206958770752, "eval_runtime": 83.3169, "eval_samples_per_second": 60.012, "eval_steps_per_second": 15.003, "num_input_tokens_seen": 57662439520, "step": 110000 }, { "epoch": 1.0498849231429697, "grad_norm": 0.14630495011806488, "learning_rate": 0.001, "loss": 2.104, "num_input_tokens_seen": 57688651552, "step": 110050 }, { "epoch": 1.0503619256591579, "grad_norm": 0.15211914479732513, "learning_rate": 0.001, "loss": 2.0929, "num_input_tokens_seen": 57714851968, "step": 110100 }, { "epoch": 1.050838928175346, "grad_norm": 0.15288826823234558, "learning_rate": 0.001, "loss": 2.107, "num_input_tokens_seen": 57741066368, "step": 110150 }, { "epoch": 1.0513159306915343, "grad_norm": 0.18201424181461334, "learning_rate": 0.001, "loss": 2.0831, "num_input_tokens_seen": 57767277728, "step": 110200 }, { "epoch": 1.0517929332077227, "grad_norm": 0.15570518374443054, "learning_rate": 0.001, "loss": 2.1017, "num_input_tokens_seen": 57793489344, "step": 110250 }, { "epoch": 1.052269935723911, "grad_norm": 0.15590184926986694, "learning_rate": 0.001, "loss": 2.0851, "num_input_tokens_seen": 57819702752, "step": 110300 }, { "epoch": 1.0527469382400991, "grad_norm": 0.13825973868370056, "learning_rate": 0.001, "loss": 2.0966, "num_input_tokens_seen": 57845893728, "step": 110350 }, { "epoch": 1.0532239407562876, "grad_norm": 0.1424342691898346, "learning_rate": 0.001, "loss": 2.0915, "num_input_tokens_seen": 57872104704, "step": 110400 }, { "epoch": 1.0537009432724758, "grad_norm": 0.14073631167411804, "learning_rate": 0.001, "loss": 2.1029, "num_input_tokens_seen": 57898312928, "step": 110450 }, { "epoch": 1.054177945788664, "grad_norm": 0.1437380015850067, "learning_rate": 0.001, "loss": 2.0945, "num_input_tokens_seen": 57924525696, "step": 110500 }, { "epoch": 1.054177945788664, "eval_loss": 2.01132869720459, "eval_runtime": 83.1173, "eval_samples_per_second": 60.156, "eval_steps_per_second": 15.039, "num_input_tokens_seen": 57924525696, "step": 110500 }, { "epoch": 1.0546549483048524, "grad_norm": 0.14192169904708862, "learning_rate": 0.001, "loss": 2.0819, "num_input_tokens_seen": 57950733824, "step": 110550 }, { "epoch": 1.0551319508210406, "grad_norm": 0.13621965050697327, "learning_rate": 0.001, "loss": 2.096, "num_input_tokens_seen": 57976940256, "step": 110600 }, { "epoch": 1.0556089533372288, "grad_norm": 0.14394508302211761, "learning_rate": 0.001, "loss": 2.0959, "num_input_tokens_seen": 58003150272, "step": 110650 }, { "epoch": 1.056085955853417, "grad_norm": 0.15062682330608368, "learning_rate": 0.001, "loss": 2.0912, "num_input_tokens_seen": 58029359264, "step": 110700 }, { "epoch": 1.0565629583696055, "grad_norm": 0.15145541727542877, "learning_rate": 0.001, "loss": 2.0954, "num_input_tokens_seen": 58055566272, "step": 110750 }, { "epoch": 1.0570399608857937, "grad_norm": 0.14316266775131226, "learning_rate": 0.001, "loss": 2.0871, "num_input_tokens_seen": 58081774848, "step": 110800 }, { "epoch": 1.0575169634019819, "grad_norm": 0.1519429087638855, "learning_rate": 0.001, "loss": 2.1033, "num_input_tokens_seen": 58107987648, "step": 110850 }, { "epoch": 1.0579939659181703, "grad_norm": 0.1556522697210312, "learning_rate": 0.001, "loss": 2.0921, "num_input_tokens_seen": 58134199072, "step": 110900 }, { "epoch": 1.0584709684343585, "grad_norm": 0.14416266977787018, "learning_rate": 0.001, "loss": 2.0952, "num_input_tokens_seen": 58160410880, "step": 110950 }, { "epoch": 1.0589479709505467, "grad_norm": 0.14932425320148468, "learning_rate": 0.001, "loss": 2.0978, "num_input_tokens_seen": 58186625280, "step": 111000 }, { "epoch": 1.0589479709505467, "eval_loss": 2.010718584060669, "eval_runtime": 83.9004, "eval_samples_per_second": 59.594, "eval_steps_per_second": 14.899, "num_input_tokens_seen": 58186625280, "step": 111000 }, { "epoch": 1.0594249734667351, "grad_norm": 0.14156724512577057, "learning_rate": 0.001, "loss": 2.1053, "num_input_tokens_seen": 58212829344, "step": 111050 }, { "epoch": 1.0599019759829234, "grad_norm": 0.13802699744701385, "learning_rate": 0.001, "loss": 2.0977, "num_input_tokens_seen": 58239042144, "step": 111100 }, { "epoch": 1.0603789784991116, "grad_norm": 0.1430686116218567, "learning_rate": 0.001, "loss": 2.085, "num_input_tokens_seen": 58265254688, "step": 111150 }, { "epoch": 1.0608559810152998, "grad_norm": 0.1441573053598404, "learning_rate": 0.001, "loss": 2.0912, "num_input_tokens_seen": 58291469088, "step": 111200 }, { "epoch": 1.0613329835314882, "grad_norm": 0.14418621361255646, "learning_rate": 0.001, "loss": 2.0902, "num_input_tokens_seen": 58317683424, "step": 111250 }, { "epoch": 1.0618099860476764, "grad_norm": 0.140812486410141, "learning_rate": 0.001, "loss": 2.097, "num_input_tokens_seen": 58343897024, "step": 111300 }, { "epoch": 1.0622869885638646, "grad_norm": 0.13597142696380615, "learning_rate": 0.001, "loss": 2.0914, "num_input_tokens_seen": 58370104576, "step": 111350 }, { "epoch": 1.062763991080053, "grad_norm": 0.14179456233978271, "learning_rate": 0.001, "loss": 2.0888, "num_input_tokens_seen": 58396317792, "step": 111400 }, { "epoch": 1.0632409935962412, "grad_norm": 0.15151284635066986, "learning_rate": 0.001, "loss": 2.0825, "num_input_tokens_seen": 58422525344, "step": 111450 }, { "epoch": 1.0637179961124295, "grad_norm": 0.15292806923389435, "learning_rate": 0.001, "loss": 2.0873, "num_input_tokens_seen": 58448739744, "step": 111500 }, { "epoch": 1.0637179961124295, "eval_loss": 2.010636329650879, "eval_runtime": 82.455, "eval_samples_per_second": 60.639, "eval_steps_per_second": 15.16, "num_input_tokens_seen": 58448739744, "step": 111500 }, { "epoch": 1.0641949986286177, "grad_norm": 0.1312059611082077, "learning_rate": 0.001, "loss": 2.0916, "num_input_tokens_seen": 58474952288, "step": 111550 }, { "epoch": 1.064672001144806, "grad_norm": 0.14581365883350372, "learning_rate": 0.001, "loss": 2.08, "num_input_tokens_seen": 58501161280, "step": 111600 }, { "epoch": 1.0651490036609943, "grad_norm": 0.1432618349790573, "learning_rate": 0.001, "loss": 2.0919, "num_input_tokens_seen": 58527355744, "step": 111650 }, { "epoch": 1.0656260061771825, "grad_norm": 0.14399270713329315, "learning_rate": 0.001, "loss": 2.095, "num_input_tokens_seen": 58553570144, "step": 111700 }, { "epoch": 1.066103008693371, "grad_norm": 0.14912860095500946, "learning_rate": 0.001, "loss": 2.0856, "num_input_tokens_seen": 58579784544, "step": 111750 }, { "epoch": 1.0665800112095591, "grad_norm": 0.13581617176532745, "learning_rate": 0.001, "loss": 2.0919, "num_input_tokens_seen": 58605992992, "step": 111800 }, { "epoch": 1.0670570137257473, "grad_norm": 0.1407386064529419, "learning_rate": 0.001, "loss": 2.0906, "num_input_tokens_seen": 58632200352, "step": 111850 }, { "epoch": 1.0675340162419358, "grad_norm": 0.15018606185913086, "learning_rate": 0.001, "loss": 2.0932, "num_input_tokens_seen": 58658411040, "step": 111900 }, { "epoch": 1.068011018758124, "grad_norm": 0.15312473475933075, "learning_rate": 0.001, "loss": 2.0941, "num_input_tokens_seen": 58684621696, "step": 111950 }, { "epoch": 1.0684880212743122, "grad_norm": 0.155229389667511, "learning_rate": 0.001, "loss": 2.0768, "num_input_tokens_seen": 58710836096, "step": 112000 }, { "epoch": 1.0684880212743122, "eval_loss": 2.0109803676605225, "eval_runtime": 82.3106, "eval_samples_per_second": 60.745, "eval_steps_per_second": 15.186, "num_input_tokens_seen": 58710836096, "step": 112000 }, { "epoch": 1.0689650237905004, "grad_norm": 0.13854993879795074, "learning_rate": 0.0009999921320324326, "loss": 2.0951, "num_input_tokens_seen": 58737048768, "step": 112050 }, { "epoch": 1.0694420263066888, "grad_norm": 0.15185818076133728, "learning_rate": 0.0009999685283773503, "loss": 2.0978, "num_input_tokens_seen": 58763257760, "step": 112100 }, { "epoch": 1.069919028822877, "grad_norm": 0.13926972448825836, "learning_rate": 0.000999929189777604, "loss": 2.0967, "num_input_tokens_seen": 58789471168, "step": 112150 }, { "epoch": 1.0703960313390652, "grad_norm": 0.1538979411125183, "learning_rate": 0.0009998741174712534, "loss": 2.0981, "num_input_tokens_seen": 58815685568, "step": 112200 }, { "epoch": 1.0708730338552537, "grad_norm": 0.16998735070228577, "learning_rate": 0.0009998033131915266, "loss": 2.092, "num_input_tokens_seen": 58841890880, "step": 112250 }, { "epoch": 1.0713500363714419, "grad_norm": 0.13714225590229034, "learning_rate": 0.0009997167791667668, "loss": 2.0834, "num_input_tokens_seen": 58868091424, "step": 112300 }, { "epoch": 1.07182703888763, "grad_norm": 0.1447121649980545, "learning_rate": 0.0009996145181203615, "loss": 2.1021, "num_input_tokens_seen": 58894304800, "step": 112350 }, { "epoch": 1.0723040414038185, "grad_norm": 0.14412052929401398, "learning_rate": 0.0009994965332706573, "loss": 2.0748, "num_input_tokens_seen": 58920503936, "step": 112400 }, { "epoch": 1.0727810439200067, "grad_norm": 0.14241209626197815, "learning_rate": 0.000999362828330858, "loss": 2.0932, "num_input_tokens_seen": 58946705920, "step": 112450 }, { "epoch": 1.073258046436195, "grad_norm": 0.13844363391399384, "learning_rate": 0.0009992134075089082, "loss": 2.0901, "num_input_tokens_seen": 58972918688, "step": 112500 }, { "epoch": 1.073258046436195, "eval_loss": 2.0120961666107178, "eval_runtime": 82.9684, "eval_samples_per_second": 60.264, "eval_steps_per_second": 15.066, "num_input_tokens_seen": 58972918688, "step": 112500 }, { "epoch": 1.0737350489523831, "grad_norm": 0.14107921719551086, "learning_rate": 0.0009990482755073606, "loss": 2.0982, "num_input_tokens_seen": 58999132320, "step": 112550 }, { "epoch": 1.0742120514685716, "grad_norm": 0.1476968675851822, "learning_rate": 0.000998867437523228, "loss": 2.0945, "num_input_tokens_seen": 59025346720, "step": 112600 }, { "epoch": 1.0746890539847598, "grad_norm": 0.14322301745414734, "learning_rate": 0.0009986708992478202, "loss": 2.0886, "num_input_tokens_seen": 59051559072, "step": 112650 }, { "epoch": 1.075166056500948, "grad_norm": 0.14406149089336395, "learning_rate": 0.000998458666866564, "loss": 2.0951, "num_input_tokens_seen": 59077772416, "step": 112700 }, { "epoch": 1.0756430590171364, "grad_norm": 0.14645279943943024, "learning_rate": 0.0009982307470588097, "loss": 2.0924, "num_input_tokens_seen": 59103984384, "step": 112750 }, { "epoch": 1.0761200615333246, "grad_norm": 0.14315037429332733, "learning_rate": 0.0009979871469976197, "loss": 2.0866, "num_input_tokens_seen": 59130197568, "step": 112800 }, { "epoch": 1.0765970640495128, "grad_norm": 0.15335896611213684, "learning_rate": 0.0009977278743495434, "loss": 2.0834, "num_input_tokens_seen": 59156411840, "step": 112850 }, { "epoch": 1.077074066565701, "grad_norm": 0.14293555915355682, "learning_rate": 0.0009974529372743762, "loss": 2.1005, "num_input_tokens_seen": 59182626240, "step": 112900 }, { "epoch": 1.0775510690818895, "grad_norm": 0.14281043410301208, "learning_rate": 0.000997162344424902, "loss": 2.1049, "num_input_tokens_seen": 59208840640, "step": 112950 }, { "epoch": 1.0780280715980777, "grad_norm": 0.12666131556034088, "learning_rate": 0.0009968561049466214, "loss": 2.0889, "num_input_tokens_seen": 59235047232, "step": 113000 }, { "epoch": 1.0780280715980777, "eval_loss": 2.0116124153137207, "eval_runtime": 109.4574, "eval_samples_per_second": 45.68, "eval_steps_per_second": 11.42, "num_input_tokens_seen": 59235047232, "step": 113000 }, { "epoch": 1.0785050741142659, "grad_norm": 0.14696183800697327, "learning_rate": 0.0009965342284774632, "loss": 2.084, "num_input_tokens_seen": 59261256096, "step": 113050 }, { "epoch": 1.0789820766304543, "grad_norm": 0.1535506546497345, "learning_rate": 0.0009961967251474822, "loss": 2.0905, "num_input_tokens_seen": 59287464384, "step": 113100 }, { "epoch": 1.0794590791466425, "grad_norm": 0.14321501553058624, "learning_rate": 0.000995843605578539, "loss": 2.0971, "num_input_tokens_seen": 59313669856, "step": 113150 }, { "epoch": 1.0799360816628307, "grad_norm": 0.15687337517738342, "learning_rate": 0.0009954748808839674, "loss": 2.0864, "num_input_tokens_seen": 59339879328, "step": 113200 }, { "epoch": 1.080413084179019, "grad_norm": 0.16271081566810608, "learning_rate": 0.000995090562668223, "loss": 2.0948, "num_input_tokens_seen": 59366089088, "step": 113250 }, { "epoch": 1.0808900866952074, "grad_norm": 0.14683839678764343, "learning_rate": 0.0009946906630265184, "loss": 2.105, "num_input_tokens_seen": 59392300448, "step": 113300 }, { "epoch": 1.0813670892113956, "grad_norm": 0.15148819983005524, "learning_rate": 0.0009942751945444437, "loss": 2.0814, "num_input_tokens_seen": 59418514560, "step": 113350 }, { "epoch": 1.0818440917275838, "grad_norm": 0.14587359130382538, "learning_rate": 0.0009938441702975688, "loss": 2.0943, "num_input_tokens_seen": 59444719360, "step": 113400 }, { "epoch": 1.0823210942437722, "grad_norm": 0.14699944853782654, "learning_rate": 0.0009933976038510332, "loss": 2.0927, "num_input_tokens_seen": 59470933600, "step": 113450 }, { "epoch": 1.0827980967599604, "grad_norm": 0.14229649305343628, "learning_rate": 0.0009929355092591179, "loss": 2.0985, "num_input_tokens_seen": 59497148000, "step": 113500 }, { "epoch": 1.0827980967599604, "eval_loss": 2.009983539581299, "eval_runtime": 82.6823, "eval_samples_per_second": 60.472, "eval_steps_per_second": 15.118, "num_input_tokens_seen": 59497148000, "step": 113500 }, { "epoch": 1.0832750992761486, "grad_norm": 0.14160077273845673, "learning_rate": 0.0009924579010648041, "loss": 2.0935, "num_input_tokens_seen": 59523359584, "step": 113550 }, { "epoch": 1.083752101792337, "grad_norm": 0.1411445587873459, "learning_rate": 0.0009919647942993148, "loss": 2.093, "num_input_tokens_seen": 59549569568, "step": 113600 }, { "epoch": 1.0842291043085253, "grad_norm": 0.13501347601413727, "learning_rate": 0.0009914562044816423, "loss": 2.0919, "num_input_tokens_seen": 59575783200, "step": 113650 }, { "epoch": 1.0847061068247135, "grad_norm": 0.14355099201202393, "learning_rate": 0.0009909321476180592, "loss": 2.0913, "num_input_tokens_seen": 59601990304, "step": 113700 }, { "epoch": 1.0851831093409017, "grad_norm": 0.13246339559555054, "learning_rate": 0.0009903926402016153, "loss": 2.0803, "num_input_tokens_seen": 59628197120, "step": 113750 }, { "epoch": 1.08566011185709, "grad_norm": 0.13418996334075928, "learning_rate": 0.0009898376992116178, "loss": 2.1042, "num_input_tokens_seen": 59654409856, "step": 113800 }, { "epoch": 1.0861371143732783, "grad_norm": 0.15235918760299683, "learning_rate": 0.0009892673421130977, "loss": 2.0987, "num_input_tokens_seen": 59680620096, "step": 113850 }, { "epoch": 1.0866141168894665, "grad_norm": 0.1395738422870636, "learning_rate": 0.0009886815868562597, "loss": 2.0932, "num_input_tokens_seen": 59706827264, "step": 113900 }, { "epoch": 1.087091119405655, "grad_norm": 0.1433008313179016, "learning_rate": 0.000988080451875917, "loss": 2.0943, "num_input_tokens_seen": 59733034688, "step": 113950 }, { "epoch": 1.0875681219218432, "grad_norm": 0.14490137994289398, "learning_rate": 0.0009874639560909118, "loss": 2.1012, "num_input_tokens_seen": 59759249088, "step": 114000 }, { "epoch": 1.0875681219218432, "eval_loss": 2.0104737281799316, "eval_runtime": 82.5956, "eval_samples_per_second": 60.536, "eval_steps_per_second": 15.134, "num_input_tokens_seen": 59759249088, "step": 114000 }, { "epoch": 1.0880451244380314, "grad_norm": 0.1358513981103897, "learning_rate": 0.0009868321189035196, "loss": 2.1057, "num_input_tokens_seen": 59785457920, "step": 114050 }, { "epoch": 1.0885221269542198, "grad_norm": 0.14738275110721588, "learning_rate": 0.0009861849601988384, "loss": 2.099, "num_input_tokens_seen": 59811672288, "step": 114100 }, { "epoch": 1.088999129470408, "grad_norm": 0.16324234008789062, "learning_rate": 0.0009855225003441628, "loss": 2.0952, "num_input_tokens_seen": 59837885600, "step": 114150 }, { "epoch": 1.0894761319865962, "grad_norm": 0.15156808495521545, "learning_rate": 0.0009848447601883434, "loss": 2.1014, "num_input_tokens_seen": 59864099392, "step": 114200 }, { "epoch": 1.0899531345027844, "grad_norm": 0.14273667335510254, "learning_rate": 0.0009841517610611307, "loss": 2.0898, "num_input_tokens_seen": 59890311072, "step": 114250 }, { "epoch": 1.0904301370189728, "grad_norm": 0.1409289538860321, "learning_rate": 0.0009834435247725033, "loss": 2.0798, "num_input_tokens_seen": 59916523776, "step": 114300 }, { "epoch": 1.090907139535161, "grad_norm": 0.13659177720546722, "learning_rate": 0.0009827200736119814, "loss": 2.084, "num_input_tokens_seen": 59942727744, "step": 114350 }, { "epoch": 1.0913841420513493, "grad_norm": 0.14861910045146942, "learning_rate": 0.0009819814303479266, "loss": 2.1021, "num_input_tokens_seen": 59968942144, "step": 114400 }, { "epoch": 1.0918611445675377, "grad_norm": 0.13872170448303223, "learning_rate": 0.0009812276182268236, "loss": 2.1001, "num_input_tokens_seen": 59995154848, "step": 114450 }, { "epoch": 1.092338147083726, "grad_norm": 0.14306657016277313, "learning_rate": 0.00098045866097255, "loss": 2.0837, "num_input_tokens_seen": 60021363392, "step": 114500 }, { "epoch": 1.092338147083726, "eval_loss": 2.0082569122314453, "eval_runtime": 82.8417, "eval_samples_per_second": 60.356, "eval_steps_per_second": 15.089, "num_input_tokens_seen": 60021363392, "step": 114500 }, { "epoch": 1.092815149599914, "grad_norm": 0.1300678551197052, "learning_rate": 0.000979674582785628, "loss": 2.0904, "num_input_tokens_seen": 60047570880, "step": 114550 }, { "epoch": 1.0932921521161023, "grad_norm": 0.1488349586725235, "learning_rate": 0.0009788754083424652, "loss": 2.0969, "num_input_tokens_seen": 60073778944, "step": 114600 }, { "epoch": 1.0937691546322907, "grad_norm": 0.14389395713806152, "learning_rate": 0.000978061162794576, "loss": 2.0956, "num_input_tokens_seen": 60099993344, "step": 114650 }, { "epoch": 1.094246157148479, "grad_norm": 0.13556672632694244, "learning_rate": 0.0009772318717677904, "loss": 2.0856, "num_input_tokens_seen": 60126204832, "step": 114700 }, { "epoch": 1.0947231596646672, "grad_norm": 0.14573290944099426, "learning_rate": 0.0009763875613614481, "loss": 2.083, "num_input_tokens_seen": 60152411456, "step": 114750 }, { "epoch": 1.0952001621808556, "grad_norm": 0.14349648356437683, "learning_rate": 0.0009755282581475768, "loss": 2.099, "num_input_tokens_seen": 60178616832, "step": 114800 }, { "epoch": 1.0956771646970438, "grad_norm": 0.1363336592912674, "learning_rate": 0.0009746539891700557, "loss": 2.0941, "num_input_tokens_seen": 60204821568, "step": 114850 }, { "epoch": 1.096154167213232, "grad_norm": 0.14463187754154205, "learning_rate": 0.0009737647819437645, "loss": 2.0987, "num_input_tokens_seen": 60231035968, "step": 114900 }, { "epoch": 1.0966311697294202, "grad_norm": 0.14132525026798248, "learning_rate": 0.0009728606644537177, "loss": 2.0954, "num_input_tokens_seen": 60257250368, "step": 114950 }, { "epoch": 1.0971081722456086, "grad_norm": 0.14640025794506073, "learning_rate": 0.0009719416651541838, "loss": 2.0992, "num_input_tokens_seen": 60283464768, "step": 115000 }, { "epoch": 1.0971081722456086, "eval_loss": 2.007655620574951, "eval_runtime": 82.4937, "eval_samples_per_second": 60.611, "eval_steps_per_second": 15.153, "num_input_tokens_seen": 60283464768, "step": 115000 }, { "epoch": 1.0975851747617968, "grad_norm": 0.14440514147281647, "learning_rate": 0.0009710078129677895, "loss": 2.0927, "num_input_tokens_seen": 60309676352, "step": 115050 }, { "epoch": 1.098062177277985, "grad_norm": 0.13419468700885773, "learning_rate": 0.0009700591372846095, "loss": 2.0871, "num_input_tokens_seen": 60335889280, "step": 115100 }, { "epoch": 1.0985391797941735, "grad_norm": 0.14434845745563507, "learning_rate": 0.0009690956679612422, "loss": 2.0823, "num_input_tokens_seen": 60362096256, "step": 115150 }, { "epoch": 1.0990161823103617, "grad_norm": 0.14158272743225098, "learning_rate": 0.0009681174353198686, "loss": 2.0932, "num_input_tokens_seen": 60388308192, "step": 115200 }, { "epoch": 1.09949318482655, "grad_norm": 0.1499590128660202, "learning_rate": 0.0009671244701472999, "loss": 2.0901, "num_input_tokens_seen": 60414516160, "step": 115250 }, { "epoch": 1.0999701873427383, "grad_norm": 0.13877320289611816, "learning_rate": 0.0009661168036940071, "loss": 2.0915, "num_input_tokens_seen": 60440722624, "step": 115300 }, { "epoch": 1.1004471898589265, "grad_norm": 0.14336808025836945, "learning_rate": 0.0009650944676731382, "loss": 2.0846, "num_input_tokens_seen": 60466923616, "step": 115350 }, { "epoch": 1.1009241923751147, "grad_norm": 0.16042272746562958, "learning_rate": 0.0009640574942595195, "loss": 2.0942, "num_input_tokens_seen": 60493123456, "step": 115400 }, { "epoch": 1.101401194891303, "grad_norm": 0.14399364590644836, "learning_rate": 0.0009630059160886439, "loss": 2.0988, "num_input_tokens_seen": 60519323040, "step": 115450 }, { "epoch": 1.1018781974074914, "grad_norm": 0.14042776823043823, "learning_rate": 0.0009619397662556434, "loss": 2.0916, "num_input_tokens_seen": 60545534656, "step": 115500 }, { "epoch": 1.1018781974074914, "eval_loss": 2.0105109214782715, "eval_runtime": 82.3145, "eval_samples_per_second": 60.743, "eval_steps_per_second": 15.186, "num_input_tokens_seen": 60545534656, "step": 115500 }, { "epoch": 1.1023551999236796, "grad_norm": 0.1399744153022766, "learning_rate": 0.000960859078314247, "loss": 2.096, "num_input_tokens_seen": 60571738272, "step": 115550 }, { "epoch": 1.1028322024398678, "grad_norm": 0.14161787927150726, "learning_rate": 0.0009597638862757254, "loss": 2.0916, "num_input_tokens_seen": 60597952672, "step": 115600 }, { "epoch": 1.1033092049560562, "grad_norm": 0.14088015258312225, "learning_rate": 0.0009586542246078203, "loss": 2.0856, "num_input_tokens_seen": 60624155648, "step": 115650 }, { "epoch": 1.1037862074722444, "grad_norm": 0.13098938763141632, "learning_rate": 0.00095753012823366, "loss": 2.0849, "num_input_tokens_seen": 60650370048, "step": 115700 }, { "epoch": 1.1042632099884326, "grad_norm": 0.14463865756988525, "learning_rate": 0.0009563916325306594, "loss": 2.0918, "num_input_tokens_seen": 60676580928, "step": 115750 }, { "epoch": 1.104740212504621, "grad_norm": 0.14490677416324615, "learning_rate": 0.000955238773329408, "loss": 2.0996, "num_input_tokens_seen": 60702794432, "step": 115800 }, { "epoch": 1.1052172150208093, "grad_norm": 0.14372467994689941, "learning_rate": 0.0009540715869125407, "loss": 2.09, "num_input_tokens_seen": 60729000064, "step": 115850 }, { "epoch": 1.1056942175369975, "grad_norm": 0.16468504071235657, "learning_rate": 0.000952890110013597, "loss": 2.0901, "num_input_tokens_seen": 60755212896, "step": 115900 }, { "epoch": 1.1061712200531857, "grad_norm": 0.390666663646698, "learning_rate": 0.0009516943798158648, "loss": 2.0855, "num_input_tokens_seen": 60781425984, "step": 115950 }, { "epoch": 1.1066482225693741, "grad_norm": 0.14308005571365356, "learning_rate": 0.0009504844339512095, "loss": 2.1125, "num_input_tokens_seen": 60807636160, "step": 116000 }, { "epoch": 1.1066482225693741, "eval_loss": 2.0120937824249268, "eval_runtime": 82.7927, "eval_samples_per_second": 60.392, "eval_steps_per_second": 15.098, "num_input_tokens_seen": 60807636160, "step": 116000 }, { "epoch": 1.1071252250855623, "grad_norm": 0.13944968581199646, "learning_rate": 0.0009492603104988907, "loss": 2.1028, "num_input_tokens_seen": 60833850560, "step": 116050 }, { "epoch": 1.1076022276017505, "grad_norm": 0.14454355835914612, "learning_rate": 0.0009480220479843627, "loss": 2.0995, "num_input_tokens_seen": 60860064224, "step": 116100 }, { "epoch": 1.108079230117939, "grad_norm": 0.1737418919801712, "learning_rate": 0.0009467696853780625, "loss": 2.0841, "num_input_tokens_seen": 60886278080, "step": 116150 }, { "epoch": 1.1085562326341272, "grad_norm": 0.1442703902721405, "learning_rate": 0.0009455032620941839, "loss": 2.0847, "num_input_tokens_seen": 60912488608, "step": 116200 }, { "epoch": 1.1090332351503154, "grad_norm": 0.14151588082313538, "learning_rate": 0.0009442228179894363, "loss": 2.0939, "num_input_tokens_seen": 60938699264, "step": 116250 }, { "epoch": 1.1095102376665036, "grad_norm": 0.12823954224586487, "learning_rate": 0.00094292839336179, "loss": 2.0911, "num_input_tokens_seen": 60964913664, "step": 116300 }, { "epoch": 1.109987240182692, "grad_norm": 0.1551038920879364, "learning_rate": 0.0009416200289492091, "loss": 2.0905, "num_input_tokens_seen": 60991126176, "step": 116350 }, { "epoch": 1.1104642426988802, "grad_norm": 0.14844666421413422, "learning_rate": 0.000940297765928369, "loss": 2.0853, "num_input_tokens_seen": 61017336640, "step": 116400 }, { "epoch": 1.1109412452150684, "grad_norm": 0.14786940813064575, "learning_rate": 0.0009389616459133597, "loss": 2.0948, "num_input_tokens_seen": 61043543488, "step": 116450 }, { "epoch": 1.1114182477312569, "grad_norm": 0.1404752880334854, "learning_rate": 0.0009376117109543769, "loss": 2.0889, "num_input_tokens_seen": 61069752768, "step": 116500 }, { "epoch": 1.1114182477312569, "eval_loss": 2.007530450820923, "eval_runtime": 83.3145, "eval_samples_per_second": 60.014, "eval_steps_per_second": 15.003, "num_input_tokens_seen": 61069752768, "step": 116500 }, { "epoch": 1.111895250247445, "grad_norm": 0.14887551963329315, "learning_rate": 0.0009362480035363986, "loss": 2.0906, "num_input_tokens_seen": 61095967168, "step": 116550 }, { "epoch": 1.1123722527636333, "grad_norm": 0.1436939537525177, "learning_rate": 0.0009348705665778478, "loss": 2.0857, "num_input_tokens_seen": 61122178400, "step": 116600 }, { "epoch": 1.1128492552798217, "grad_norm": 0.15015645325183868, "learning_rate": 0.0009334794434292415, "loss": 2.0877, "num_input_tokens_seen": 61148383936, "step": 116650 }, { "epoch": 1.11332625779601, "grad_norm": 0.15639320015907288, "learning_rate": 0.0009320746778718274, "loss": 2.082, "num_input_tokens_seen": 61174590560, "step": 116700 }, { "epoch": 1.1138032603121981, "grad_norm": 0.1376616209745407, "learning_rate": 0.0009306563141162046, "loss": 2.0893, "num_input_tokens_seen": 61200799104, "step": 116750 }, { "epoch": 1.1142802628283863, "grad_norm": 0.13897264003753662, "learning_rate": 0.000929224396800933, "loss": 2.0885, "num_input_tokens_seen": 61227004960, "step": 116800 }, { "epoch": 1.1147572653445748, "grad_norm": 0.16240862011909485, "learning_rate": 0.0009277789709911291, "loss": 2.0772, "num_input_tokens_seen": 61253214976, "step": 116850 }, { "epoch": 1.115234267860763, "grad_norm": 0.13620969653129578, "learning_rate": 0.0009263200821770461, "loss": 2.0815, "num_input_tokens_seen": 61279425344, "step": 116900 }, { "epoch": 1.1157112703769512, "grad_norm": 0.13625779747962952, "learning_rate": 0.0009248477762726437, "loss": 2.0834, "num_input_tokens_seen": 61305623936, "step": 116950 }, { "epoch": 1.1161882728931396, "grad_norm": 0.1379876434803009, "learning_rate": 0.0009233620996141421, "loss": 2.0879, "num_input_tokens_seen": 61331831488, "step": 117000 }, { "epoch": 1.1161882728931396, "eval_loss": 2.0054421424865723, "eval_runtime": 82.7611, "eval_samples_per_second": 60.415, "eval_steps_per_second": 15.104, "num_input_tokens_seen": 61331831488, "step": 117000 }, { "epoch": 1.1166652754093278, "grad_norm": 0.13141483068466187, "learning_rate": 0.0009218630989585645, "loss": 2.0933, "num_input_tokens_seen": 61358045888, "step": 117050 }, { "epoch": 1.117142277925516, "grad_norm": 0.14495305716991425, "learning_rate": 0.0009203508214822651, "loss": 2.0864, "num_input_tokens_seen": 61384257568, "step": 117100 }, { "epoch": 1.1176192804417044, "grad_norm": 0.14642465114593506, "learning_rate": 0.0009188253147794443, "loss": 2.0918, "num_input_tokens_seen": 61410471968, "step": 117150 }, { "epoch": 1.1180962829578927, "grad_norm": 0.13314634561538696, "learning_rate": 0.0009172866268606513, "loss": 2.0896, "num_input_tokens_seen": 61436668768, "step": 117200 }, { "epoch": 1.1185732854740809, "grad_norm": 0.15387175977230072, "learning_rate": 0.0009157348061512727, "loss": 2.0771, "num_input_tokens_seen": 61462881056, "step": 117250 }, { "epoch": 1.119050287990269, "grad_norm": 0.13886821269989014, "learning_rate": 0.0009141699014900082, "loss": 2.0945, "num_input_tokens_seen": 61489085536, "step": 117300 }, { "epoch": 1.1195272905064575, "grad_norm": 0.13939301669597626, "learning_rate": 0.0009125919621273348, "loss": 2.0918, "num_input_tokens_seen": 61515286016, "step": 117350 }, { "epoch": 1.1200042930226457, "grad_norm": 0.1996990144252777, "learning_rate": 0.0009110010377239551, "loss": 2.0859, "num_input_tokens_seen": 61541500416, "step": 117400 }, { "epoch": 1.120481295538834, "grad_norm": 0.135545015335083, "learning_rate": 0.0009093971783492354, "loss": 2.089, "num_input_tokens_seen": 61567714816, "step": 117450 }, { "epoch": 1.1209582980550223, "grad_norm": 0.1394105702638626, "learning_rate": 0.0009077804344796301, "loss": 2.0759, "num_input_tokens_seen": 61593927520, "step": 117500 }, { "epoch": 1.1209582980550223, "eval_loss": 2.003880739212036, "eval_runtime": 83.0803, "eval_samples_per_second": 60.183, "eval_steps_per_second": 15.046, "num_input_tokens_seen": 61593927520, "step": 117500 }, { "epoch": 1.1214353005712105, "grad_norm": 0.1590648591518402, "learning_rate": 0.0009061508569970925, "loss": 2.0825, "num_input_tokens_seen": 61620139072, "step": 117550 }, { "epoch": 1.1219123030873988, "grad_norm": 0.13328000903129578, "learning_rate": 0.0009045084971874737, "loss": 2.0877, "num_input_tokens_seen": 61646353472, "step": 117600 }, { "epoch": 1.122389305603587, "grad_norm": 0.13834019005298615, "learning_rate": 0.0009028534067389086, "loss": 2.0871, "num_input_tokens_seen": 61672566336, "step": 117650 }, { "epoch": 1.1228663081197754, "grad_norm": 0.13156409561634064, "learning_rate": 0.000901185637740189, "loss": 2.0906, "num_input_tokens_seen": 61698777696, "step": 117700 }, { "epoch": 1.1233433106359636, "grad_norm": 0.1528773009777069, "learning_rate": 0.0008995052426791246, "loss": 2.0731, "num_input_tokens_seen": 61724974336, "step": 117750 }, { "epoch": 1.1238203131521518, "grad_norm": 0.14865480363368988, "learning_rate": 0.0008978122744408905, "loss": 2.082, "num_input_tokens_seen": 61751177792, "step": 117800 }, { "epoch": 1.1242973156683402, "grad_norm": 0.14318804442882538, "learning_rate": 0.0008961067863063638, "loss": 2.0891, "num_input_tokens_seen": 61777391648, "step": 117850 }, { "epoch": 1.1247743181845284, "grad_norm": 0.14581789076328278, "learning_rate": 0.0008943888319504456, "loss": 2.0908, "num_input_tokens_seen": 61803602176, "step": 117900 }, { "epoch": 1.1252513207007167, "grad_norm": 0.14142882823944092, "learning_rate": 0.0008926584654403724, "loss": 2.0791, "num_input_tokens_seen": 61829816576, "step": 117950 }, { "epoch": 1.1257283232169049, "grad_norm": 0.15033917129039764, "learning_rate": 0.000890915741234015, "loss": 2.0801, "num_input_tokens_seen": 61856020192, "step": 118000 }, { "epoch": 1.1257283232169049, "eval_loss": 2.0019845962524414, "eval_runtime": 82.7188, "eval_samples_per_second": 60.446, "eval_steps_per_second": 15.111, "num_input_tokens_seen": 61856020192, "step": 118000 }, { "epoch": 1.1262053257330933, "grad_norm": 0.15097704529762268, "learning_rate": 0.0008891607141781631, "loss": 2.0857, "num_input_tokens_seen": 61882234592, "step": 118050 }, { "epoch": 1.1266823282492815, "grad_norm": 0.1383848935365677, "learning_rate": 0.0008873934395068005, "loss": 2.0858, "num_input_tokens_seen": 61908441120, "step": 118100 }, { "epoch": 1.1271593307654697, "grad_norm": 0.14688968658447266, "learning_rate": 0.0008856139728393666, "loss": 2.085, "num_input_tokens_seen": 61934653984, "step": 118150 }, { "epoch": 1.1276363332816581, "grad_norm": 0.14446312189102173, "learning_rate": 0.0008838223701790055, "loss": 2.0765, "num_input_tokens_seen": 61960867808, "step": 118200 }, { "epoch": 1.1281133357978463, "grad_norm": 0.1389646828174591, "learning_rate": 0.0008820186879108038, "loss": 2.0816, "num_input_tokens_seen": 61987070336, "step": 118250 }, { "epoch": 1.1285903383140345, "grad_norm": 0.14348453283309937, "learning_rate": 0.0008802029828000156, "loss": 2.0875, "num_input_tokens_seen": 62013276640, "step": 118300 }, { "epoch": 1.129067340830223, "grad_norm": 0.14246419072151184, "learning_rate": 0.0008783753119902765, "loss": 2.0828, "num_input_tokens_seen": 62039490144, "step": 118350 }, { "epoch": 1.1295443433464112, "grad_norm": 0.13848936557769775, "learning_rate": 0.0008765357330018055, "loss": 2.0895, "num_input_tokens_seen": 62065704544, "step": 118400 }, { "epoch": 1.1300213458625994, "grad_norm": 0.14894653856754303, "learning_rate": 0.0008746843037295936, "loss": 2.079, "num_input_tokens_seen": 62091916704, "step": 118450 }, { "epoch": 1.1304983483787878, "grad_norm": 0.1354195922613144, "learning_rate": 0.0008728210824415827, "loss": 2.0836, "num_input_tokens_seen": 62118128864, "step": 118500 }, { "epoch": 1.1304983483787878, "eval_loss": 2.004451274871826, "eval_runtime": 82.4857, "eval_samples_per_second": 60.617, "eval_steps_per_second": 15.154, "num_input_tokens_seen": 62118128864, "step": 118500 }, { "epoch": 1.130975350894976, "grad_norm": 0.14576098322868347, "learning_rate": 0.0008709461277768318, "loss": 2.0912, "num_input_tokens_seen": 62144343264, "step": 118550 }, { "epoch": 1.1314523534111642, "grad_norm": 0.14351360499858856, "learning_rate": 0.0008690594987436704, "loss": 2.0777, "num_input_tokens_seen": 62170554112, "step": 118600 }, { "epoch": 1.1319293559273524, "grad_norm": 0.14756879210472107, "learning_rate": 0.0008671612547178428, "loss": 2.0907, "num_input_tokens_seen": 62196764384, "step": 118650 }, { "epoch": 1.1324063584435409, "grad_norm": 0.15026496350765228, "learning_rate": 0.0008652514554406388, "loss": 2.0857, "num_input_tokens_seen": 62222966592, "step": 118700 }, { "epoch": 1.132883360959729, "grad_norm": 0.13817134499549866, "learning_rate": 0.0008633301610170136, "loss": 2.0851, "num_input_tokens_seen": 62249176192, "step": 118750 }, { "epoch": 1.1333603634759173, "grad_norm": 0.13346219062805176, "learning_rate": 0.0008613974319136957, "loss": 2.0856, "num_input_tokens_seen": 62275388064, "step": 118800 }, { "epoch": 1.1338373659921057, "grad_norm": 0.14300605654716492, "learning_rate": 0.0008594533289572853, "loss": 2.0835, "num_input_tokens_seen": 62301602464, "step": 118850 }, { "epoch": 1.134314368508294, "grad_norm": 0.13790345191955566, "learning_rate": 0.0008574979133323377, "loss": 2.0811, "num_input_tokens_seen": 62327812128, "step": 118900 }, { "epoch": 1.1347913710244821, "grad_norm": 0.1419474184513092, "learning_rate": 0.0008555312465794402, "loss": 2.0783, "num_input_tokens_seen": 62354024288, "step": 118950 }, { "epoch": 1.1352683735406703, "grad_norm": 0.15154699981212616, "learning_rate": 0.0008535533905932737, "loss": 2.0858, "num_input_tokens_seen": 62380238112, "step": 119000 }, { "epoch": 1.1352683735406703, "eval_loss": 2.0006425380706787, "eval_runtime": 82.1764, "eval_samples_per_second": 60.845, "eval_steps_per_second": 15.211, "num_input_tokens_seen": 62380238112, "step": 119000 }, { "epoch": 1.1357453760568588, "grad_norm": 0.1409357637166977, "learning_rate": 0.0008515644076206653, "loss": 2.0885, "num_input_tokens_seen": 62406448192, "step": 119050 }, { "epoch": 1.136222378573047, "grad_norm": 0.15409712493419647, "learning_rate": 0.0008495643602586287, "loss": 2.0778, "num_input_tokens_seen": 62432661632, "step": 119100 }, { "epoch": 1.1366993810892352, "grad_norm": 0.1327887326478958, "learning_rate": 0.0008475533114523955, "loss": 2.086, "num_input_tokens_seen": 62458870752, "step": 119150 }, { "epoch": 1.1371763836054236, "grad_norm": 0.14051629602909088, "learning_rate": 0.0008455313244934324, "loss": 2.0765, "num_input_tokens_seen": 62485082688, "step": 119200 }, { "epoch": 1.1376533861216118, "grad_norm": 0.13998936116695404, "learning_rate": 0.0008434984630174508, "loss": 2.0784, "num_input_tokens_seen": 62511288832, "step": 119250 }, { "epoch": 1.1381303886378, "grad_norm": 0.1316358745098114, "learning_rate": 0.0008414547910024035, "loss": 2.0839, "num_input_tokens_seen": 62537499648, "step": 119300 }, { "epoch": 1.1386073911539882, "grad_norm": 0.13315369188785553, "learning_rate": 0.0008394003727664709, "loss": 2.0793, "num_input_tokens_seen": 62563710336, "step": 119350 }, { "epoch": 1.1390843936701767, "grad_norm": 0.1454961597919464, "learning_rate": 0.0008373352729660373, "loss": 2.0814, "num_input_tokens_seen": 62589918400, "step": 119400 }, { "epoch": 1.1395613961863649, "grad_norm": 0.14860859513282776, "learning_rate": 0.0008352595565936554, "loss": 2.0885, "num_input_tokens_seen": 62616130880, "step": 119450 }, { "epoch": 1.140038398702553, "grad_norm": 0.13664905726909637, "learning_rate": 0.000833173288976002, "loss": 2.0836, "num_input_tokens_seen": 62642339520, "step": 119500 }, { "epoch": 1.140038398702553, "eval_loss": 1.9989631175994873, "eval_runtime": 83.3074, "eval_samples_per_second": 60.019, "eval_steps_per_second": 15.005, "num_input_tokens_seen": 62642339520, "step": 119500 }, { "epoch": 1.1405154012187415, "grad_norm": 0.1337277889251709, "learning_rate": 0.0008310765357718206, "loss": 2.0745, "num_input_tokens_seen": 62668548896, "step": 119550 }, { "epoch": 1.1409924037349297, "grad_norm": 0.13231709599494934, "learning_rate": 0.0008289693629698564, "loss": 2.0851, "num_input_tokens_seen": 62694761888, "step": 119600 }, { "epoch": 1.141469406251118, "grad_norm": 0.13446244597434998, "learning_rate": 0.0008268518368867782, "loss": 2.0737, "num_input_tokens_seen": 62720974368, "step": 119650 }, { "epoch": 1.1419464087673061, "grad_norm": 0.14359907805919647, "learning_rate": 0.0008247240241650918, "loss": 2.0772, "num_input_tokens_seen": 62747188768, "step": 119700 }, { "epoch": 1.1424234112834946, "grad_norm": 0.13156485557556152, "learning_rate": 0.0008225859917710439, "loss": 2.0791, "num_input_tokens_seen": 62773395936, "step": 119750 }, { "epoch": 1.1429004137996828, "grad_norm": 0.14039525389671326, "learning_rate": 0.000820437806992512, "loss": 2.0656, "num_input_tokens_seen": 62799610336, "step": 119800 }, { "epoch": 1.143377416315871, "grad_norm": 0.14653949439525604, "learning_rate": 0.0008182795374368893, "loss": 2.0741, "num_input_tokens_seen": 62825821984, "step": 119850 }, { "epoch": 1.1438544188320594, "grad_norm": 0.12294785678386688, "learning_rate": 0.0008161112510289549, "loss": 2.0741, "num_input_tokens_seen": 62852031840, "step": 119900 }, { "epoch": 1.1443314213482476, "grad_norm": 0.18639816343784332, "learning_rate": 0.0008139330160087374, "loss": 2.1258, "num_input_tokens_seen": 62878240576, "step": 119950 }, { "epoch": 1.1448084238644358, "grad_norm": 0.1320071518421173, "learning_rate": 0.0008117449009293668, "loss": 2.0956, "num_input_tokens_seen": 62904447680, "step": 120000 }, { "epoch": 1.1448084238644358, "eval_loss": 2.0032639503479004, "eval_runtime": 82.8531, "eval_samples_per_second": 60.348, "eval_steps_per_second": 15.087, "num_input_tokens_seen": 62904447680, "step": 120000 }, { "epoch": 1.1452854263806242, "grad_norm": 0.15100175142288208, "learning_rate": 0.0008095469746549171, "loss": 2.0793, "num_input_tokens_seen": 62930656352, "step": 120050 }, { "epoch": 1.1457624288968125, "grad_norm": 0.14095434546470642, "learning_rate": 0.0008073393063582386, "loss": 2.0828, "num_input_tokens_seen": 62956868576, "step": 120100 }, { "epoch": 1.1462394314130007, "grad_norm": 0.15013264119625092, "learning_rate": 0.0008051219655187818, "loss": 2.0711, "num_input_tokens_seen": 62983080544, "step": 120150 }, { "epoch": 1.146716433929189, "grad_norm": 0.1443673074245453, "learning_rate": 0.00080289502192041, "loss": 2.0764, "num_input_tokens_seen": 63009276608, "step": 120200 }, { "epoch": 1.1471934364453773, "grad_norm": 0.13627703487873077, "learning_rate": 0.0008006585456492029, "loss": 2.0805, "num_input_tokens_seen": 63035488032, "step": 120250 }, { "epoch": 1.1476704389615655, "grad_norm": 0.14744721353054047, "learning_rate": 0.0007984126070912518, "loss": 2.0691, "num_input_tokens_seen": 63061701600, "step": 120300 }, { "epoch": 1.1481474414777537, "grad_norm": 0.14301970601081848, "learning_rate": 0.0007961572769304437, "loss": 2.0788, "num_input_tokens_seen": 63087914624, "step": 120350 }, { "epoch": 1.1486244439939421, "grad_norm": 0.13261480629444122, "learning_rate": 0.0007938926261462366, "loss": 2.0802, "num_input_tokens_seen": 63114128096, "step": 120400 }, { "epoch": 1.1491014465101304, "grad_norm": 0.14857733249664307, "learning_rate": 0.0007916187260114262, "loss": 2.0773, "num_input_tokens_seen": 63140341024, "step": 120450 }, { "epoch": 1.1495784490263186, "grad_norm": 0.13263733685016632, "learning_rate": 0.000789335648089903, "loss": 2.0796, "num_input_tokens_seen": 63166554368, "step": 120500 }, { "epoch": 1.1495784490263186, "eval_loss": 1.9961134195327759, "eval_runtime": 82.5305, "eval_samples_per_second": 60.584, "eval_steps_per_second": 15.146, "num_input_tokens_seen": 63166554368, "step": 120500 }, { "epoch": 1.150055451542507, "grad_norm": 0.13879702985286713, "learning_rate": 0.0007870434642343984, "loss": 2.0783, "num_input_tokens_seen": 63192764288, "step": 120550 }, { "epoch": 1.1505324540586952, "grad_norm": 0.13164860010147095, "learning_rate": 0.000784742246584226, "loss": 2.081, "num_input_tokens_seen": 63218969504, "step": 120600 }, { "epoch": 1.1510094565748834, "grad_norm": 0.1406654268503189, "learning_rate": 0.0007824320675630089, "loss": 2.0704, "num_input_tokens_seen": 63245179680, "step": 120650 }, { "epoch": 1.1514864590910716, "grad_norm": 0.13722951710224152, "learning_rate": 0.0007801129998764014, "loss": 2.0693, "num_input_tokens_seen": 63271389024, "step": 120700 }, { "epoch": 1.15196346160726, "grad_norm": 0.15168820321559906, "learning_rate": 0.0007777851165098011, "loss": 2.0813, "num_input_tokens_seen": 63297594624, "step": 120750 }, { "epoch": 1.1524404641234482, "grad_norm": 0.13907547295093536, "learning_rate": 0.0007754484907260512, "loss": 2.0747, "num_input_tokens_seen": 63323809024, "step": 120800 }, { "epoch": 1.1529174666396365, "grad_norm": 0.13827022910118103, "learning_rate": 0.0007731031960631354, "loss": 2.079, "num_input_tokens_seen": 63350015808, "step": 120850 }, { "epoch": 1.1533944691558249, "grad_norm": 0.1326221376657486, "learning_rate": 0.0007707493063318629, "loss": 2.0856, "num_input_tokens_seen": 63376227968, "step": 120900 }, { "epoch": 1.153871471672013, "grad_norm": 0.13669894635677338, "learning_rate": 0.000768386895613546, "loss": 2.0691, "num_input_tokens_seen": 63402433504, "step": 120950 }, { "epoch": 1.1543484741882013, "grad_norm": 0.1403321623802185, "learning_rate": 0.0007660160382576683, "loss": 2.077, "num_input_tokens_seen": 63428647904, "step": 121000 }, { "epoch": 1.1543484741882013, "eval_loss": 1.9939944744110107, "eval_runtime": 82.7663, "eval_samples_per_second": 60.411, "eval_steps_per_second": 15.103, "num_input_tokens_seen": 63428647904, "step": 121000 }, { "epoch": 1.1548254767043895, "grad_norm": 0.1527141034603119, "learning_rate": 0.000763636808879545, "loss": 2.0812, "num_input_tokens_seen": 63454858592, "step": 121050 }, { "epoch": 1.155302479220578, "grad_norm": 0.14409616589546204, "learning_rate": 0.0007612492823579744, "loss": 2.0757, "num_input_tokens_seen": 63481069536, "step": 121100 }, { "epoch": 1.1557794817367661, "grad_norm": 0.1311630755662918, "learning_rate": 0.0007588535338328816, "loss": 2.0714, "num_input_tokens_seen": 63507276640, "step": 121150 }, { "epoch": 1.1562564842529544, "grad_norm": 0.12864112854003906, "learning_rate": 0.0007564496387029531, "loss": 2.0703, "num_input_tokens_seen": 63533491040, "step": 121200 }, { "epoch": 1.1567334867691428, "grad_norm": 0.1277550309896469, "learning_rate": 0.0007540376726232647, "loss": 2.0833, "num_input_tokens_seen": 63559699712, "step": 121250 }, { "epoch": 1.157210489285331, "grad_norm": 0.13141444325447083, "learning_rate": 0.0007516177115029001, "loss": 2.0755, "num_input_tokens_seen": 63585905408, "step": 121300 }, { "epoch": 1.1576874918015192, "grad_norm": 0.13436725735664368, "learning_rate": 0.0007491898315025615, "loss": 2.0716, "num_input_tokens_seen": 63612116704, "step": 121350 }, { "epoch": 1.1581644943177074, "grad_norm": 0.13668642938137054, "learning_rate": 0.0007467541090321735, "loss": 2.0766, "num_input_tokens_seen": 63638330048, "step": 121400 }, { "epoch": 1.1586414968338958, "grad_norm": 0.22589260339736938, "learning_rate": 0.0007443106207484776, "loss": 2.0793, "num_input_tokens_seen": 63664542944, "step": 121450 }, { "epoch": 1.159118499350084, "grad_norm": 0.14154261350631714, "learning_rate": 0.00074185944355262, "loss": 2.0938, "num_input_tokens_seen": 63690757024, "step": 121500 }, { "epoch": 1.159118499350084, "eval_loss": 1.9929685592651367, "eval_runtime": 82.8366, "eval_samples_per_second": 60.36, "eval_steps_per_second": 15.09, "num_input_tokens_seen": 63690757024, "step": 121500 }, { "epoch": 1.1595955018662722, "grad_norm": 0.13303405046463013, "learning_rate": 0.0007394006545877314, "loss": 2.078, "num_input_tokens_seen": 63716968288, "step": 121550 }, { "epoch": 1.1600725043824607, "grad_norm": 0.12762907147407532, "learning_rate": 0.0007369343312364993, "loss": 2.0757, "num_input_tokens_seen": 63743181728, "step": 121600 }, { "epoch": 1.1605495068986489, "grad_norm": 0.160507932305336, "learning_rate": 0.0007344605511187322, "loss": 2.076, "num_input_tokens_seen": 63769396128, "step": 121650 }, { "epoch": 1.161026509414837, "grad_norm": 0.14160197973251343, "learning_rate": 0.0007319793920889171, "loss": 2.0762, "num_input_tokens_seen": 63795607296, "step": 121700 }, { "epoch": 1.1615035119310255, "grad_norm": 0.15858200192451477, "learning_rate": 0.0007294909322337689, "loss": 2.08, "num_input_tokens_seen": 63821818336, "step": 121750 }, { "epoch": 1.1619805144472137, "grad_norm": 0.13940422236919403, "learning_rate": 0.0007269952498697733, "loss": 2.0816, "num_input_tokens_seen": 63848031552, "step": 121800 }, { "epoch": 1.162457516963402, "grad_norm": 0.13600219786167145, "learning_rate": 0.0007244924235407223, "loss": 2.0757, "num_input_tokens_seen": 63874245952, "step": 121850 }, { "epoch": 1.1629345194795904, "grad_norm": 0.14759120345115662, "learning_rate": 0.0007219825320152411, "loss": 2.0883, "num_input_tokens_seen": 63900453792, "step": 121900 }, { "epoch": 1.1634115219957786, "grad_norm": 0.12860442698001862, "learning_rate": 0.0007194656542843102, "loss": 2.0802, "num_input_tokens_seen": 63926661920, "step": 121950 }, { "epoch": 1.1638885245119668, "grad_norm": 0.13766394555568695, "learning_rate": 0.0007169418695587791, "loss": 2.072, "num_input_tokens_seen": 63952872768, "step": 122000 }, { "epoch": 1.1638885245119668, "eval_loss": 1.991066813468933, "eval_runtime": 82.2634, "eval_samples_per_second": 60.78, "eval_steps_per_second": 15.195, "num_input_tokens_seen": 63952872768, "step": 122000 }, { "epoch": 1.164365527028155, "grad_norm": 0.13863904774188995, "learning_rate": 0.0007144112572668733, "loss": 2.0703, "num_input_tokens_seen": 63979084224, "step": 122050 }, { "epoch": 1.1648425295443434, "grad_norm": 0.1426379680633545, "learning_rate": 0.0007118738970516943, "loss": 2.0766, "num_input_tokens_seen": 64005286944, "step": 122100 }, { "epoch": 1.1653195320605316, "grad_norm": 0.13977181911468506, "learning_rate": 0.0007093298687687141, "loss": 2.0692, "num_input_tokens_seen": 64031487744, "step": 122150 }, { "epoch": 1.1657965345767198, "grad_norm": 0.1425238400697708, "learning_rate": 0.0007067792524832604, "loss": 2.0662, "num_input_tokens_seen": 64057695552, "step": 122200 }, { "epoch": 1.1662735370929083, "grad_norm": 0.15061677992343903, "learning_rate": 0.0007042221284679982, "loss": 2.0781, "num_input_tokens_seen": 64083893664, "step": 122250 }, { "epoch": 1.1667505396090965, "grad_norm": 0.12374892085790634, "learning_rate": 0.0007016585772004026, "loss": 2.0745, "num_input_tokens_seen": 64110107392, "step": 122300 }, { "epoch": 1.1672275421252847, "grad_norm": 0.1427278071641922, "learning_rate": 0.0006990886793602267, "loss": 2.0861, "num_input_tokens_seen": 64136321792, "step": 122350 }, { "epoch": 1.1677045446414729, "grad_norm": 0.15141050517559052, "learning_rate": 0.0006965125158269618, "loss": 2.0767, "num_input_tokens_seen": 64162534656, "step": 122400 }, { "epoch": 1.1681815471576613, "grad_norm": 0.13262976706027985, "learning_rate": 0.0006939301676772927, "loss": 2.0662, "num_input_tokens_seen": 64188740064, "step": 122450 }, { "epoch": 1.1686585496738495, "grad_norm": 0.13390204310417175, "learning_rate": 0.000691341716182545, "loss": 2.0684, "num_input_tokens_seen": 64214942816, "step": 122500 }, { "epoch": 1.1686585496738495, "eval_loss": 1.9892343282699585, "eval_runtime": 81.7351, "eval_samples_per_second": 61.173, "eval_steps_per_second": 15.293, "num_input_tokens_seen": 64214942816, "step": 122500 }, { "epoch": 1.1691355521900377, "grad_norm": 0.14351387321949005, "learning_rate": 0.0006887472428061285, "loss": 2.0611, "num_input_tokens_seen": 64241151872, "step": 122550 }, { "epoch": 1.1696125547062262, "grad_norm": 0.1321556568145752, "learning_rate": 0.0006861468292009726, "loss": 2.0726, "num_input_tokens_seen": 64267354176, "step": 122600 }, { "epoch": 1.1700895572224144, "grad_norm": 0.12825502455234528, "learning_rate": 0.0006835405572069572, "loss": 2.0703, "num_input_tokens_seen": 64293568544, "step": 122650 }, { "epoch": 1.1705665597386026, "grad_norm": 0.1376345157623291, "learning_rate": 0.0006809285088483361, "loss": 2.0789, "num_input_tokens_seen": 64319782944, "step": 122700 }, { "epoch": 1.1710435622547908, "grad_norm": 0.14178837835788727, "learning_rate": 0.0006783107663311565, "loss": 2.0755, "num_input_tokens_seen": 64345996064, "step": 122750 }, { "epoch": 1.1715205647709792, "grad_norm": 0.1475340873003006, "learning_rate": 0.0006756874120406714, "loss": 2.0668, "num_input_tokens_seen": 64372202944, "step": 122800 }, { "epoch": 1.1719975672871674, "grad_norm": 0.13012921810150146, "learning_rate": 0.0006730585285387465, "loss": 2.0618, "num_input_tokens_seen": 64398414944, "step": 122850 }, { "epoch": 1.1724745698033556, "grad_norm": 0.13203522562980652, "learning_rate": 0.0006704241985612625, "loss": 2.0712, "num_input_tokens_seen": 64424627264, "step": 122900 }, { "epoch": 1.172951572319544, "grad_norm": 0.13648848235607147, "learning_rate": 0.0006677845050155106, "loss": 2.0694, "num_input_tokens_seen": 64450839392, "step": 122950 }, { "epoch": 1.1734285748357323, "grad_norm": 0.1383182257413864, "learning_rate": 0.0006651395309775837, "loss": 2.0564, "num_input_tokens_seen": 64477051392, "step": 123000 }, { "epoch": 1.1734285748357323, "eval_loss": 1.9881237745285034, "eval_runtime": 82.9953, "eval_samples_per_second": 60.244, "eval_steps_per_second": 15.061, "num_input_tokens_seen": 64477051392, "step": 123000 }, { "epoch": 1.1739055773519205, "grad_norm": 0.14069771766662598, "learning_rate": 0.0006624893596897613, "loss": 2.0767, "num_input_tokens_seen": 64503259872, "step": 123050 }, { "epoch": 1.174382579868109, "grad_norm": 0.14180107414722443, "learning_rate": 0.0006598340745578908, "loss": 2.0611, "num_input_tokens_seen": 64529460896, "step": 123100 }, { "epoch": 1.174859582384297, "grad_norm": 0.14584094285964966, "learning_rate": 0.000657173759148761, "loss": 2.0693, "num_input_tokens_seen": 64555675296, "step": 123150 }, { "epoch": 1.1753365849004853, "grad_norm": 0.1269799768924713, "learning_rate": 0.0006545084971874737, "loss": 2.0615, "num_input_tokens_seen": 64581882720, "step": 123200 }, { "epoch": 1.1758135874166737, "grad_norm": 0.15073458850383759, "learning_rate": 0.0006518383725548074, "loss": 2.083, "num_input_tokens_seen": 64608088736, "step": 123250 }, { "epoch": 1.176290589932862, "grad_norm": 0.12902715802192688, "learning_rate": 0.000649163469284578, "loss": 2.0579, "num_input_tokens_seen": 64634299936, "step": 123300 }, { "epoch": 1.1767675924490502, "grad_norm": 0.13666096329689026, "learning_rate": 0.0006464838715609945, "loss": 2.0673, "num_input_tokens_seen": 64660511904, "step": 123350 }, { "epoch": 1.1772445949652384, "grad_norm": 0.13477379083633423, "learning_rate": 0.0006437996637160086, "loss": 2.0752, "num_input_tokens_seen": 64686718272, "step": 123400 }, { "epoch": 1.1777215974814268, "grad_norm": 0.13596594333648682, "learning_rate": 0.0006411109302266615, "loss": 2.0606, "num_input_tokens_seen": 64712932256, "step": 123450 }, { "epoch": 1.178198599997615, "grad_norm": 0.1400011032819748, "learning_rate": 0.0006384177557124247, "loss": 2.066, "num_input_tokens_seen": 64739145440, "step": 123500 }, { "epoch": 1.178198599997615, "eval_loss": 1.986546516418457, "eval_runtime": 82.7963, "eval_samples_per_second": 60.389, "eval_steps_per_second": 15.097, "num_input_tokens_seen": 64739145440, "step": 123500 }, { "epoch": 1.1786756025138032, "grad_norm": 0.13023069500923157, "learning_rate": 0.0006357202249325371, "loss": 2.0727, "num_input_tokens_seen": 64765359840, "step": 123550 }, { "epoch": 1.1791526050299916, "grad_norm": 0.13744056224822998, "learning_rate": 0.0006330184227833376, "loss": 2.0603, "num_input_tokens_seen": 64791573504, "step": 123600 }, { "epoch": 1.1796296075461798, "grad_norm": 0.1399419903755188, "learning_rate": 0.0006303124342955927, "loss": 2.0699, "num_input_tokens_seen": 64817787904, "step": 123650 }, { "epoch": 1.180106610062368, "grad_norm": 0.13453304767608643, "learning_rate": 0.0006276023446318213, "loss": 2.0764, "num_input_tokens_seen": 64844002304, "step": 123700 }, { "epoch": 1.1805836125785563, "grad_norm": 0.13495005667209625, "learning_rate": 0.0006248882390836135, "loss": 2.0629, "num_input_tokens_seen": 64870216704, "step": 123750 }, { "epoch": 1.1810606150947447, "grad_norm": 0.14330346882343292, "learning_rate": 0.000622170203068947, "loss": 2.0677, "num_input_tokens_seen": 64896426784, "step": 123800 }, { "epoch": 1.181537617610933, "grad_norm": 0.13179130852222443, "learning_rate": 0.0006194483221294988, "loss": 2.0568, "num_input_tokens_seen": 64922636000, "step": 123850 }, { "epoch": 1.182014620127121, "grad_norm": 0.12518762052059174, "learning_rate": 0.0006167226819279528, "loss": 2.0604, "num_input_tokens_seen": 64948840416, "step": 123900 }, { "epoch": 1.1824916226433095, "grad_norm": 0.12823528051376343, "learning_rate": 0.0006139933682453035, "loss": 2.0683, "num_input_tokens_seen": 64975054816, "step": 123950 }, { "epoch": 1.1829686251594977, "grad_norm": 0.1308305859565735, "learning_rate": 0.0006112604669781572, "loss": 2.0639, "num_input_tokens_seen": 65001257824, "step": 124000 }, { "epoch": 1.1829686251594977, "eval_loss": 1.9843353033065796, "eval_runtime": 82.7751, "eval_samples_per_second": 60.405, "eval_steps_per_second": 15.101, "num_input_tokens_seen": 65001257824, "step": 124000 }, { "epoch": 1.183445627675686, "grad_norm": 0.12966303527355194, "learning_rate": 0.0006085240641360281, "loss": 2.0655, "num_input_tokens_seen": 65027466432, "step": 124050 }, { "epoch": 1.1839226301918742, "grad_norm": 0.13216206431388855, "learning_rate": 0.0006057842458386314, "loss": 2.0787, "num_input_tokens_seen": 65053680192, "step": 124100 }, { "epoch": 1.1843996327080626, "grad_norm": 0.13295891880989075, "learning_rate": 0.0006030410983131733, "loss": 2.0654, "num_input_tokens_seen": 65079892928, "step": 124150 }, { "epoch": 1.1848766352242508, "grad_norm": 0.14478819072246552, "learning_rate": 0.0006002947078916364, "loss": 2.0638, "num_input_tokens_seen": 65106107328, "step": 124200 }, { "epoch": 1.185353637740439, "grad_norm": 0.13410045206546783, "learning_rate": 0.0005975451610080642, "loss": 2.0711, "num_input_tokens_seen": 65132321728, "step": 124250 }, { "epoch": 1.1858306402566274, "grad_norm": 0.14699777960777283, "learning_rate": 0.0005947925441958392, "loss": 2.0574, "num_input_tokens_seen": 65158534656, "step": 124300 }, { "epoch": 1.1863076427728156, "grad_norm": 0.13368327915668488, "learning_rate": 0.0005920369440849609, "loss": 2.0626, "num_input_tokens_seen": 65184748736, "step": 124350 }, { "epoch": 1.1867846452890038, "grad_norm": 0.13047395646572113, "learning_rate": 0.0005892784473993184, "loss": 2.06, "num_input_tokens_seen": 65210950912, "step": 124400 }, { "epoch": 1.187261647805192, "grad_norm": 0.13072432577610016, "learning_rate": 0.0005865171409539613, "loss": 2.0869, "num_input_tokens_seen": 65237165312, "step": 124450 }, { "epoch": 1.1877386503213805, "grad_norm": 0.14443765580654144, "learning_rate": 0.0005837531116523682, "loss": 2.0675, "num_input_tokens_seen": 65263378112, "step": 124500 }, { "epoch": 1.1877386503213805, "eval_loss": 1.9832085371017456, "eval_runtime": 83.5278, "eval_samples_per_second": 59.86, "eval_steps_per_second": 14.965, "num_input_tokens_seen": 65263378112, "step": 124500 }, { "epoch": 1.1882156528375687, "grad_norm": 0.13271184265613556, "learning_rate": 0.0005809864464837105, "loss": 2.0507, "num_input_tokens_seen": 65289588448, "step": 124550 }, { "epoch": 1.188692655353757, "grad_norm": 0.13720299303531647, "learning_rate": 0.0005782172325201155, "loss": 2.0728, "num_input_tokens_seen": 65315802432, "step": 124600 }, { "epoch": 1.1891696578699453, "grad_norm": 0.12747812271118164, "learning_rate": 0.0005754455569139257, "loss": 2.0786, "num_input_tokens_seen": 65342011648, "step": 124650 }, { "epoch": 1.1896466603861335, "grad_norm": 0.13649390637874603, "learning_rate": 0.0005726715068949564, "loss": 2.0578, "num_input_tokens_seen": 65368225184, "step": 124700 }, { "epoch": 1.1901236629023217, "grad_norm": 0.13283640146255493, "learning_rate": 0.0005698951697677498, "loss": 2.0616, "num_input_tokens_seen": 65394434464, "step": 124750 }, { "epoch": 1.1906006654185102, "grad_norm": 0.13304251432418823, "learning_rate": 0.0005671166329088278, "loss": 2.0657, "num_input_tokens_seen": 65420648864, "step": 124800 }, { "epoch": 1.1910776679346984, "grad_norm": 0.1442023664712906, "learning_rate": 0.000564335983763942, "loss": 2.0584, "num_input_tokens_seen": 65446854944, "step": 124850 }, { "epoch": 1.1915546704508866, "grad_norm": 0.13637055456638336, "learning_rate": 0.0005615533098453215, "loss": 2.0719, "num_input_tokens_seen": 65473067296, "step": 124900 }, { "epoch": 1.192031672967075, "grad_norm": 0.13165481388568878, "learning_rate": 0.0005587686987289189, "loss": 2.0594, "num_input_tokens_seen": 65499281184, "step": 124950 }, { "epoch": 1.1925086754832632, "grad_norm": 0.14200669527053833, "learning_rate": 0.0005559822380516539, "loss": 2.0692, "num_input_tokens_seen": 65525493280, "step": 125000 }, { "epoch": 1.1925086754832632, "eval_loss": 1.982203722000122, "eval_runtime": 82.3332, "eval_samples_per_second": 60.729, "eval_steps_per_second": 15.182, "num_input_tokens_seen": 65525493280, "step": 125000 }, { "epoch": 1.1929856779994514, "grad_norm": 0.13272584974765778, "learning_rate": 0.0005531940155086557, "loss": 2.0602, "num_input_tokens_seen": 65551700064, "step": 125050 }, { "epoch": 1.1934626805156396, "grad_norm": 0.14066773653030396, "learning_rate": 0.0005504041188505022, "loss": 2.0695, "num_input_tokens_seen": 65577910784, "step": 125100 }, { "epoch": 1.193939683031828, "grad_norm": 0.13133113086223602, "learning_rate": 0.0005476126358804593, "loss": 2.0686, "num_input_tokens_seen": 65604124224, "step": 125150 }, { "epoch": 1.1944166855480163, "grad_norm": 0.13990654051303864, "learning_rate": 0.0005448196544517168, "loss": 2.0532, "num_input_tokens_seen": 65630324960, "step": 125200 }, { "epoch": 1.1948936880642045, "grad_norm": 0.14154765009880066, "learning_rate": 0.0005420252624646238, "loss": 2.0518, "num_input_tokens_seen": 65656532992, "step": 125250 }, { "epoch": 1.195370690580393, "grad_norm": 0.13149969279766083, "learning_rate": 0.0005392295478639225, "loss": 2.0619, "num_input_tokens_seen": 65682736768, "step": 125300 }, { "epoch": 1.1958476930965811, "grad_norm": 0.1339765191078186, "learning_rate": 0.0005364325986359802, "loss": 2.0706, "num_input_tokens_seen": 65708951168, "step": 125350 }, { "epoch": 1.1963246956127693, "grad_norm": 0.13910150527954102, "learning_rate": 0.0005336345028060199, "loss": 2.0596, "num_input_tokens_seen": 65735165568, "step": 125400 }, { "epoch": 1.1968016981289575, "grad_norm": 0.1447630077600479, "learning_rate": 0.0005308353484353508, "loss": 2.0518, "num_input_tokens_seen": 65761369888, "step": 125450 }, { "epoch": 1.197278700645146, "grad_norm": 0.13201679289340973, "learning_rate": 0.0005280352236185959, "loss": 2.0645, "num_input_tokens_seen": 65787582144, "step": 125500 }, { "epoch": 1.197278700645146, "eval_loss": 1.9799100160598755, "eval_runtime": 83.01, "eval_samples_per_second": 60.234, "eval_steps_per_second": 15.058, "num_input_tokens_seen": 65787582144, "step": 125500 }, { "epoch": 1.1977557031613342, "grad_norm": 0.1335040032863617, "learning_rate": 0.0005252342164809204, "loss": 2.0597, "num_input_tokens_seen": 65813796352, "step": 125550 }, { "epoch": 1.1982327056775224, "grad_norm": 0.13693130016326904, "learning_rate": 0.0005224324151752575, "loss": 2.0594, "num_input_tokens_seen": 65840010208, "step": 125600 }, { "epoch": 1.1987097081937108, "grad_norm": 0.13866880536079407, "learning_rate": 0.0005196299078795343, "loss": 2.0511, "num_input_tokens_seen": 65866216672, "step": 125650 }, { "epoch": 1.199186710709899, "grad_norm": 0.12740108370780945, "learning_rate": 0.000516826782793897, "loss": 2.0607, "num_input_tokens_seen": 65892430944, "step": 125700 }, { "epoch": 1.1996637132260872, "grad_norm": 0.13575108349323273, "learning_rate": 0.0005140231281379345, "loss": 2.0555, "num_input_tokens_seen": 65918642496, "step": 125750 }, { "epoch": 1.2001407157422754, "grad_norm": 0.13791455328464508, "learning_rate": 0.0005112190321479025, "loss": 2.0632, "num_input_tokens_seen": 65944852960, "step": 125800 }, { "epoch": 1.2006177182584639, "grad_norm": 0.1315431296825409, "learning_rate": 0.0005084145830739461, "loss": 2.0646, "num_input_tokens_seen": 65971066432, "step": 125850 }, { "epoch": 1.201094720774652, "grad_norm": 0.12288303673267365, "learning_rate": 0.000505609869177323, "loss": 2.0748, "num_input_tokens_seen": 65997277888, "step": 125900 }, { "epoch": 1.2015717232908403, "grad_norm": 0.12677106261253357, "learning_rate": 0.0005028049787276249, "loss": 2.0595, "num_input_tokens_seen": 66023480960, "step": 125950 }, { "epoch": 1.2020487258070287, "grad_norm": 0.140994593501091, "learning_rate": 0.0005, "loss": 2.0556, "num_input_tokens_seen": 66049692768, "step": 126000 }, { "epoch": 1.2020487258070287, "eval_loss": 1.978381633758545, "eval_runtime": 81.8164, "eval_samples_per_second": 61.112, "eval_steps_per_second": 15.278, "num_input_tokens_seen": 66049692768, "step": 126000 }, { "epoch": 1.202525728323217, "grad_norm": 0.1393454372882843, "learning_rate": 0.0004971950212723752, "loss": 2.0569, "num_input_tokens_seen": 66075907072, "step": 126050 }, { "epoch": 1.2030027308394051, "grad_norm": 0.1390795111656189, "learning_rate": 0.0004943901308226771, "loss": 2.0579, "num_input_tokens_seen": 66102120320, "step": 126100 }, { "epoch": 1.2034797333555933, "grad_norm": 0.136804461479187, "learning_rate": 0.0004915854169260539, "loss": 2.0594, "num_input_tokens_seen": 66128330880, "step": 126150 }, { "epoch": 1.2039567358717818, "grad_norm": 0.14418946206569672, "learning_rate": 0.0004887809678520976, "loss": 2.0521, "num_input_tokens_seen": 66154537216, "step": 126200 }, { "epoch": 1.20443373838797, "grad_norm": 0.1406649798154831, "learning_rate": 0.00048597687186206556, "loss": 2.0604, "num_input_tokens_seen": 66180744192, "step": 126250 }, { "epoch": 1.2049107409041582, "grad_norm": 0.13004782795906067, "learning_rate": 0.0004831732172061032, "loss": 2.0633, "num_input_tokens_seen": 66206951232, "step": 126300 }, { "epoch": 1.2053877434203466, "grad_norm": 0.1319655478000641, "learning_rate": 0.00048037009212046586, "loss": 2.0609, "num_input_tokens_seen": 66233151744, "step": 126350 }, { "epoch": 1.2058647459365348, "grad_norm": 0.13051386177539825, "learning_rate": 0.0004775675848247427, "loss": 2.0591, "num_input_tokens_seen": 66259358592, "step": 126400 }, { "epoch": 1.206341748452723, "grad_norm": 0.12983474135398865, "learning_rate": 0.0004747657835190795, "loss": 2.0571, "num_input_tokens_seen": 66285559520, "step": 126450 }, { "epoch": 1.2068187509689114, "grad_norm": 0.12744031846523285, "learning_rate": 0.00047196477638140405, "loss": 2.0581, "num_input_tokens_seen": 66311770112, "step": 126500 }, { "epoch": 1.2068187509689114, "eval_loss": 1.9767038822174072, "eval_runtime": 82.0094, "eval_samples_per_second": 60.969, "eval_steps_per_second": 15.242, "num_input_tokens_seen": 66311770112, "step": 126500 }, { "epoch": 1.2072957534850997, "grad_norm": 0.13606679439544678, "learning_rate": 0.00046916465156464924, "loss": 2.062, "num_input_tokens_seen": 66337979200, "step": 126550 }, { "epoch": 1.2077727560012879, "grad_norm": 0.12876896560192108, "learning_rate": 0.0004663654971939802, "loss": 2.0627, "num_input_tokens_seen": 66364192640, "step": 126600 }, { "epoch": 1.2082497585174763, "grad_norm": 0.18826884031295776, "learning_rate": 0.00046356740136402, "loss": 2.0573, "num_input_tokens_seen": 66390404768, "step": 126650 }, { "epoch": 1.2087267610336645, "grad_norm": 0.1488431692123413, "learning_rate": 0.0004607704521360776, "loss": 2.0592, "num_input_tokens_seen": 66416613920, "step": 126700 }, { "epoch": 1.2092037635498527, "grad_norm": 0.12901978194713593, "learning_rate": 0.0004579747375353763, "loss": 2.0601, "num_input_tokens_seen": 66442820800, "step": 126750 }, { "epoch": 1.209680766066041, "grad_norm": 0.13032038509845734, "learning_rate": 0.0004551803455482833, "loss": 2.0675, "num_input_tokens_seen": 66469028480, "step": 126800 }, { "epoch": 1.2101577685822293, "grad_norm": 0.13756315410137177, "learning_rate": 0.00045238736411954073, "loss": 2.0543, "num_input_tokens_seen": 66495230816, "step": 126850 }, { "epoch": 1.2106347710984176, "grad_norm": 0.13066066801548004, "learning_rate": 0.0004495958811494978, "loss": 2.0545, "num_input_tokens_seen": 66521443360, "step": 126900 }, { "epoch": 1.2111117736146058, "grad_norm": 0.13837099075317383, "learning_rate": 0.00044680598449134434, "loss": 2.0557, "num_input_tokens_seen": 66547651488, "step": 126950 }, { "epoch": 1.2115887761307942, "grad_norm": 0.13125094771385193, "learning_rate": 0.0004440177619483461, "loss": 2.0633, "num_input_tokens_seen": 66573856704, "step": 127000 }, { "epoch": 1.2115887761307942, "eval_loss": 1.9741461277008057, "eval_runtime": 82.3333, "eval_samples_per_second": 60.729, "eval_steps_per_second": 15.182, "num_input_tokens_seen": 66573856704, "step": 127000 }, { "epoch": 1.2120657786469824, "grad_norm": 0.13154049217700958, "learning_rate": 0.00044123130127108126, "loss": 2.0525, "num_input_tokens_seen": 66600067712, "step": 127050 }, { "epoch": 1.2125427811631706, "grad_norm": 0.13129626214504242, "learning_rate": 0.00043844669015467863, "loss": 2.0411, "num_input_tokens_seen": 66626274400, "step": 127100 }, { "epoch": 1.2130197836793588, "grad_norm": 0.12721647322177887, "learning_rate": 0.0004356640162360581, "loss": 2.0469, "num_input_tokens_seen": 66652487040, "step": 127150 }, { "epoch": 1.2134967861955472, "grad_norm": 0.1383296549320221, "learning_rate": 0.0004328833670911724, "loss": 2.0578, "num_input_tokens_seen": 66678700288, "step": 127200 }, { "epoch": 1.2139737887117354, "grad_norm": 0.12966816127300262, "learning_rate": 0.00043010483023225046, "loss": 2.0544, "num_input_tokens_seen": 66704910336, "step": 127250 }, { "epoch": 1.2144507912279237, "grad_norm": 0.13144998252391815, "learning_rate": 0.0004273284931050438, "loss": 2.061, "num_input_tokens_seen": 66731122112, "step": 127300 }, { "epoch": 1.214927793744112, "grad_norm": 0.13422222435474396, "learning_rate": 0.0004245544430860743, "loss": 2.062, "num_input_tokens_seen": 66757331872, "step": 127350 }, { "epoch": 1.2154047962603003, "grad_norm": 0.1333204060792923, "learning_rate": 0.0004217827674798845, "loss": 2.0538, "num_input_tokens_seen": 66783545248, "step": 127400 }, { "epoch": 1.2158817987764885, "grad_norm": 0.13239559531211853, "learning_rate": 0.0004190135535162894, "loss": 2.0545, "num_input_tokens_seen": 66809758656, "step": 127450 }, { "epoch": 1.2163588012926767, "grad_norm": 0.13535359501838684, "learning_rate": 0.00041624688834763184, "loss": 2.0625, "num_input_tokens_seen": 66835970592, "step": 127500 }, { "epoch": 1.2163588012926767, "eval_loss": 1.9728902578353882, "eval_runtime": 82.272, "eval_samples_per_second": 60.774, "eval_steps_per_second": 15.194, "num_input_tokens_seen": 66835970592, "step": 127500 }, { "epoch": 1.2168358038088651, "grad_norm": 0.1306886225938797, "learning_rate": 0.0004134828590460387, "loss": 2.0548, "num_input_tokens_seen": 66862174016, "step": 127550 }, { "epoch": 1.2173128063250533, "grad_norm": 0.1322244554758072, "learning_rate": 0.0004107215526006817, "loss": 2.0544, "num_input_tokens_seen": 66888384224, "step": 127600 }, { "epoch": 1.2177898088412416, "grad_norm": 0.13241881132125854, "learning_rate": 0.0004079630559150391, "loss": 2.0646, "num_input_tokens_seen": 66914597888, "step": 127650 }, { "epoch": 1.21826681135743, "grad_norm": 0.12745130062103271, "learning_rate": 0.0004052074558041608, "loss": 2.0554, "num_input_tokens_seen": 66940807552, "step": 127700 }, { "epoch": 1.2187438138736182, "grad_norm": 0.13167862594127655, "learning_rate": 0.00040245483899193594, "loss": 2.0449, "num_input_tokens_seen": 66967017376, "step": 127750 }, { "epoch": 1.2192208163898064, "grad_norm": 0.1641312688589096, "learning_rate": 0.00039970529210836363, "loss": 2.0438, "num_input_tokens_seen": 66993229600, "step": 127800 }, { "epoch": 1.2196978189059948, "grad_norm": 0.1290162205696106, "learning_rate": 0.00039695890168682686, "loss": 2.0633, "num_input_tokens_seen": 67019433984, "step": 127850 }, { "epoch": 1.220174821422183, "grad_norm": 0.12822365760803223, "learning_rate": 0.0003942157541613686, "loss": 2.0477, "num_input_tokens_seen": 67045643168, "step": 127900 }, { "epoch": 1.2206518239383712, "grad_norm": 0.13961108028888702, "learning_rate": 0.0003914759358639719, "loss": 2.063, "num_input_tokens_seen": 67071854592, "step": 127950 }, { "epoch": 1.2211288264545597, "grad_norm": 0.13082347810268402, "learning_rate": 0.00038873953302184284, "loss": 2.0557, "num_input_tokens_seen": 67098059328, "step": 128000 }, { "epoch": 1.2211288264545597, "eval_loss": 1.9715449810028076, "eval_runtime": 83.7065, "eval_samples_per_second": 59.733, "eval_steps_per_second": 14.933, "num_input_tokens_seen": 67098059328, "step": 128000 }, { "epoch": 1.2216058289707479, "grad_norm": 0.13075117766857147, "learning_rate": 0.00038600663175469667, "loss": 2.0582, "num_input_tokens_seen": 67124264448, "step": 128050 }, { "epoch": 1.222082831486936, "grad_norm": 0.1297282576560974, "learning_rate": 0.00038327731807204744, "loss": 2.0595, "num_input_tokens_seen": 67150472320, "step": 128100 }, { "epoch": 1.2225598340031243, "grad_norm": 0.12640318274497986, "learning_rate": 0.00038055167787050134, "loss": 2.0525, "num_input_tokens_seen": 67176672192, "step": 128150 }, { "epoch": 1.2230368365193127, "grad_norm": 0.1315733790397644, "learning_rate": 0.00037782979693105293, "loss": 2.0499, "num_input_tokens_seen": 67202877408, "step": 128200 }, { "epoch": 1.223513839035501, "grad_norm": 0.12865200638771057, "learning_rate": 0.0003751117609163865, "loss": 2.051, "num_input_tokens_seen": 67229091168, "step": 128250 }, { "epoch": 1.2239908415516891, "grad_norm": 0.1271800547838211, "learning_rate": 0.00037239765536817873, "loss": 2.0555, "num_input_tokens_seen": 67255304768, "step": 128300 }, { "epoch": 1.2244678440678776, "grad_norm": 0.13572408258914948, "learning_rate": 0.0003696875657044073, "loss": 2.0622, "num_input_tokens_seen": 67281509184, "step": 128350 }, { "epoch": 1.2249448465840658, "grad_norm": 0.12558363378047943, "learning_rate": 0.0003669815772166625, "loss": 2.0548, "num_input_tokens_seen": 67307717088, "step": 128400 }, { "epoch": 1.225421849100254, "grad_norm": 0.13062912225723267, "learning_rate": 0.0003642797750674629, "loss": 2.0473, "num_input_tokens_seen": 67333928800, "step": 128450 }, { "epoch": 1.2258988516164422, "grad_norm": 0.1351100355386734, "learning_rate": 0.00036158224428757535, "loss": 2.0475, "num_input_tokens_seen": 67360131616, "step": 128500 }, { "epoch": 1.2258988516164422, "eval_loss": 1.9701597690582275, "eval_runtime": 82.4081, "eval_samples_per_second": 60.674, "eval_steps_per_second": 15.168, "num_input_tokens_seen": 67360131616, "step": 128500 }, { "epoch": 1.2263758541326306, "grad_norm": 0.13211333751678467, "learning_rate": 0.00035888906977333857, "loss": 2.0622, "num_input_tokens_seen": 67386344736, "step": 128550 }, { "epoch": 1.2268528566488188, "grad_norm": 0.12648384273052216, "learning_rate": 0.0003562003362839914, "loss": 2.051, "num_input_tokens_seen": 67412555520, "step": 128600 }, { "epoch": 1.227329859165007, "grad_norm": 0.13109999895095825, "learning_rate": 0.00035351612843900553, "loss": 2.0529, "num_input_tokens_seen": 67438769504, "step": 128650 }, { "epoch": 1.2278068616811955, "grad_norm": 0.12981992959976196, "learning_rate": 0.000350836530715422, "loss": 2.045, "num_input_tokens_seen": 67464972864, "step": 128700 }, { "epoch": 1.2282838641973837, "grad_norm": 0.1246839389204979, "learning_rate": 0.00034816162744519263, "loss": 2.0569, "num_input_tokens_seen": 67491186176, "step": 128750 }, { "epoch": 1.2287608667135719, "grad_norm": 0.13077682256698608, "learning_rate": 0.00034549150281252633, "loss": 2.0461, "num_input_tokens_seen": 67517399168, "step": 128800 }, { "epoch": 1.22923786922976, "grad_norm": 0.12939219176769257, "learning_rate": 0.000342826240851239, "loss": 2.047, "num_input_tokens_seen": 67543606592, "step": 128850 }, { "epoch": 1.2297148717459485, "grad_norm": 0.12711487710475922, "learning_rate": 0.00034016592544210936, "loss": 2.0411, "num_input_tokens_seen": 67569807488, "step": 128900 }, { "epoch": 1.2301918742621367, "grad_norm": 0.13154172897338867, "learning_rate": 0.00033751064031023887, "loss": 2.0536, "num_input_tokens_seen": 67596020896, "step": 128950 }, { "epoch": 1.230668876778325, "grad_norm": 0.1312495321035385, "learning_rate": 0.00033486046902241664, "loss": 2.0558, "num_input_tokens_seen": 67622231264, "step": 129000 }, { "epoch": 1.230668876778325, "eval_loss": 1.9686726331710815, "eval_runtime": 82.3322, "eval_samples_per_second": 60.73, "eval_steps_per_second": 15.182, "num_input_tokens_seen": 67622231264, "step": 129000 }, { "epoch": 1.2311458792945134, "grad_norm": 0.13078469038009644, "learning_rate": 0.00033221549498448967, "loss": 2.0474, "num_input_tokens_seen": 67648445664, "step": 129050 }, { "epoch": 1.2316228818107016, "grad_norm": 0.1259986162185669, "learning_rate": 0.0003295758014387375, "loss": 2.0605, "num_input_tokens_seen": 67674654432, "step": 129100 }, { "epoch": 1.2320998843268898, "grad_norm": 0.13479039072990417, "learning_rate": 0.0003269414714612534, "loss": 2.0499, "num_input_tokens_seen": 67700854208, "step": 129150 }, { "epoch": 1.232576886843078, "grad_norm": 0.12382933497428894, "learning_rate": 0.0003243125879593286, "loss": 2.0403, "num_input_tokens_seen": 67727067232, "step": 129200 }, { "epoch": 1.2330538893592664, "grad_norm": 0.13765262067317963, "learning_rate": 0.0003216892336688435, "loss": 2.05, "num_input_tokens_seen": 67753274144, "step": 129250 }, { "epoch": 1.2335308918754546, "grad_norm": 0.13626757264137268, "learning_rate": 0.000319071491151664, "loss": 2.0533, "num_input_tokens_seen": 67779485312, "step": 129300 }, { "epoch": 1.2340078943916428, "grad_norm": 0.13541923463344574, "learning_rate": 0.00031645944279304295, "loss": 2.0502, "num_input_tokens_seen": 67805697216, "step": 129350 }, { "epoch": 1.2344848969078313, "grad_norm": 0.12669889628887177, "learning_rate": 0.00031385317079902743, "loss": 2.0434, "num_input_tokens_seen": 67831908160, "step": 129400 }, { "epoch": 1.2349618994240195, "grad_norm": 0.12400075793266296, "learning_rate": 0.0003112527571938717, "loss": 2.0556, "num_input_tokens_seen": 67858116736, "step": 129450 }, { "epoch": 1.2354389019402077, "grad_norm": 0.13263045251369476, "learning_rate": 0.0003086582838174551, "loss": 2.0405, "num_input_tokens_seen": 67884327168, "step": 129500 }, { "epoch": 1.2354389019402077, "eval_loss": 1.966764211654663, "eval_runtime": 82.4836, "eval_samples_per_second": 60.618, "eval_steps_per_second": 15.155, "num_input_tokens_seen": 67884327168, "step": 129500 }, { "epoch": 1.235915904456396, "grad_norm": 0.12067709863185883, "learning_rate": 0.00030606983232270746, "loss": 2.0511, "num_input_tokens_seen": 67910538880, "step": 129550 }, { "epoch": 1.2363929069725843, "grad_norm": 0.13021409511566162, "learning_rate": 0.0003034874841730382, "loss": 2.0525, "num_input_tokens_seen": 67936753280, "step": 129600 }, { "epoch": 1.2368699094887725, "grad_norm": 0.12661676108837128, "learning_rate": 0.0003009113206397734, "loss": 2.0575, "num_input_tokens_seen": 67962958784, "step": 129650 }, { "epoch": 1.237346912004961, "grad_norm": 0.12730489671230316, "learning_rate": 0.0002983414227995975, "loss": 2.0552, "num_input_tokens_seen": 67989169536, "step": 129700 }, { "epoch": 1.2378239145211491, "grad_norm": 0.12583428621292114, "learning_rate": 0.000295777871532002, "loss": 2.0413, "num_input_tokens_seen": 68015382560, "step": 129750 }, { "epoch": 1.2383009170373374, "grad_norm": 0.12833881378173828, "learning_rate": 0.00029322074751673977, "loss": 2.0456, "num_input_tokens_seen": 68041596960, "step": 129800 }, { "epoch": 1.2387779195535256, "grad_norm": 0.1263890564441681, "learning_rate": 0.0002906701312312861, "loss": 2.0506, "num_input_tokens_seen": 68067805312, "step": 129850 }, { "epoch": 1.239254922069714, "grad_norm": 0.1265845000743866, "learning_rate": 0.0002881261029483057, "loss": 2.0376, "num_input_tokens_seen": 68094019712, "step": 129900 }, { "epoch": 1.2397319245859022, "grad_norm": 0.1379150003194809, "learning_rate": 0.0002855887427331267, "loss": 2.0482, "num_input_tokens_seen": 68120232192, "step": 129950 }, { "epoch": 1.2402089271020904, "grad_norm": 0.12455019354820251, "learning_rate": 0.00028305813044122096, "loss": 2.038, "num_input_tokens_seen": 68146442176, "step": 130000 }, { "epoch": 1.2402089271020904, "eval_loss": 1.965224266052246, "eval_runtime": 83.0846, "eval_samples_per_second": 60.18, "eval_steps_per_second": 15.045, "num_input_tokens_seen": 68146442176, "step": 130000 }, { "epoch": 1.2406859296182788, "grad_norm": 0.12637196481227875, "learning_rate": 0.00028053434571568983, "loss": 2.0543, "num_input_tokens_seen": 68172655040, "step": 130050 }, { "epoch": 1.241162932134467, "grad_norm": 0.1351892203092575, "learning_rate": 0.000278017467984759, "loss": 2.0578, "num_input_tokens_seen": 68198869440, "step": 130100 }, { "epoch": 1.2416399346506553, "grad_norm": 0.12203965336084366, "learning_rate": 0.00027550757645927764, "loss": 2.0427, "num_input_tokens_seen": 68225083840, "step": 130150 }, { "epoch": 1.2421169371668435, "grad_norm": 0.13395994901657104, "learning_rate": 0.00027300475013022663, "loss": 2.0488, "num_input_tokens_seen": 68251293952, "step": 130200 }, { "epoch": 1.242593939683032, "grad_norm": 0.1291465014219284, "learning_rate": 0.0002705090677662311, "loss": 2.0484, "num_input_tokens_seen": 68277498432, "step": 130250 }, { "epoch": 1.24307094219922, "grad_norm": 0.12472834438085556, "learning_rate": 0.000268020607911083, "loss": 2.0538, "num_input_tokens_seen": 68303709440, "step": 130300 }, { "epoch": 1.2435479447154083, "grad_norm": 0.1263572871685028, "learning_rate": 0.0002655394488812677, "loss": 2.0487, "num_input_tokens_seen": 68329920512, "step": 130350 }, { "epoch": 1.2440249472315967, "grad_norm": 0.12614773213863373, "learning_rate": 0.0002630656687635007, "loss": 2.053, "num_input_tokens_seen": 68356112384, "step": 130400 }, { "epoch": 1.244501949747785, "grad_norm": 0.1241307333111763, "learning_rate": 0.0002605993454122687, "loss": 2.049, "num_input_tokens_seen": 68382320896, "step": 130450 }, { "epoch": 1.2449789522639731, "grad_norm": 0.12764516472816467, "learning_rate": 0.0002581405564473801, "loss": 2.0338, "num_input_tokens_seen": 68408534464, "step": 130500 }, { "epoch": 1.2449789522639731, "eval_loss": 1.9643968343734741, "eval_runtime": 82.7385, "eval_samples_per_second": 60.431, "eval_steps_per_second": 15.108, "num_input_tokens_seen": 68408534464, "step": 130500 }, { "epoch": 1.2454559547801614, "grad_norm": 0.1308233141899109, "learning_rate": 0.0002556893792515227, "loss": 2.0371, "num_input_tokens_seen": 68434747040, "step": 130550 }, { "epoch": 1.2459329572963498, "grad_norm": 0.12745235860347748, "learning_rate": 0.00025324589096782657, "loss": 2.0373, "num_input_tokens_seen": 68460951616, "step": 130600 }, { "epoch": 1.246409959812538, "grad_norm": 0.1278812736272812, "learning_rate": 0.0002508101684974387, "loss": 2.0405, "num_input_tokens_seen": 68487165696, "step": 130650 }, { "epoch": 1.2468869623287262, "grad_norm": 0.12204719334840775, "learning_rate": 0.00024838228849709997, "loss": 2.0424, "num_input_tokens_seen": 68513380096, "step": 130700 }, { "epoch": 1.2473639648449146, "grad_norm": 0.11976956576108932, "learning_rate": 0.0002459623273767354, "loss": 2.0596, "num_input_tokens_seen": 68539590240, "step": 130750 }, { "epoch": 1.2478409673611028, "grad_norm": 0.13120809197425842, "learning_rate": 0.000243550361297047, "loss": 2.037, "num_input_tokens_seen": 68565804640, "step": 130800 }, { "epoch": 1.248317969877291, "grad_norm": 0.12905927002429962, "learning_rate": 0.00024114646616711844, "loss": 2.0341, "num_input_tokens_seen": 68592007552, "step": 130850 }, { "epoch": 1.2487949723934793, "grad_norm": 0.12697407603263855, "learning_rate": 0.00023875071764202561, "loss": 2.05, "num_input_tokens_seen": 68618221952, "step": 130900 }, { "epoch": 1.2492719749096677, "grad_norm": 0.12694934010505676, "learning_rate": 0.00023636319112045495, "loss": 2.0436, "num_input_tokens_seen": 68644425984, "step": 130950 }, { "epoch": 1.249748977425856, "grad_norm": 0.1360025703907013, "learning_rate": 0.00023398396174233177, "loss": 2.0506, "num_input_tokens_seen": 68670633664, "step": 131000 }, { "epoch": 1.249748977425856, "eval_loss": 1.962631106376648, "eval_runtime": 82.4327, "eval_samples_per_second": 60.656, "eval_steps_per_second": 15.164, "num_input_tokens_seen": 68670633664, "step": 131000 }, { "epoch": 1.2502259799420443, "grad_norm": 0.13041457533836365, "learning_rate": 0.000231613104386454, "loss": 2.0362, "num_input_tokens_seen": 68696842016, "step": 131050 }, { "epoch": 1.2507029824582325, "grad_norm": 0.1306309849023819, "learning_rate": 0.00022925069366813716, "loss": 2.0593, "num_input_tokens_seen": 68723054176, "step": 131100 }, { "epoch": 1.2511799849744207, "grad_norm": 0.12761172652244568, "learning_rate": 0.00022689680393686457, "loss": 2.0496, "num_input_tokens_seen": 68749263552, "step": 131150 }, { "epoch": 1.251656987490609, "grad_norm": 0.12187056988477707, "learning_rate": 0.0002245515092739488, "loss": 2.0417, "num_input_tokens_seen": 68775477952, "step": 131200 }, { "epoch": 1.2521339900067971, "grad_norm": 0.12770666182041168, "learning_rate": 0.00022221488349019903, "loss": 2.0332, "num_input_tokens_seen": 68801692352, "step": 131250 }, { "epoch": 1.2526109925229856, "grad_norm": 0.13457396626472473, "learning_rate": 0.00021988700012359863, "loss": 2.0393, "num_input_tokens_seen": 68827900832, "step": 131300 }, { "epoch": 1.2530879950391738, "grad_norm": 0.12845295667648315, "learning_rate": 0.0002175679324369913, "loss": 2.0507, "num_input_tokens_seen": 68854107328, "step": 131350 }, { "epoch": 1.2535649975553622, "grad_norm": 0.12990029156208038, "learning_rate": 0.00021525775341577403, "loss": 2.0373, "num_input_tokens_seen": 68880316256, "step": 131400 }, { "epoch": 1.2540420000715504, "grad_norm": 0.12344187498092651, "learning_rate": 0.00021295653576560165, "loss": 2.0359, "num_input_tokens_seen": 68906521376, "step": 131450 }, { "epoch": 1.2545190025877386, "grad_norm": 0.12487955391407013, "learning_rate": 0.00021066435191009715, "loss": 2.0432, "num_input_tokens_seen": 68932735776, "step": 131500 }, { "epoch": 1.2545190025877386, "eval_loss": 1.9613933563232422, "eval_runtime": 82.9225, "eval_samples_per_second": 60.297, "eval_steps_per_second": 15.074, "num_input_tokens_seen": 68932735776, "step": 131500 }, { "epoch": 1.2549960051039268, "grad_norm": 0.13224980235099792, "learning_rate": 0.00020838127398857382, "loss": 2.0413, "num_input_tokens_seen": 68958946656, "step": 131550 }, { "epoch": 1.2554730076201153, "grad_norm": 0.12449366599321365, "learning_rate": 0.00020610737385376348, "loss": 2.0503, "num_input_tokens_seen": 68985155520, "step": 131600 }, { "epoch": 1.2559500101363035, "grad_norm": 0.12943805754184723, "learning_rate": 0.0002038427230695565, "loss": 2.0476, "num_input_tokens_seen": 69011368384, "step": 131650 }, { "epoch": 1.2564270126524917, "grad_norm": 0.1288331300020218, "learning_rate": 0.00020158739290874821, "loss": 2.0458, "num_input_tokens_seen": 69037580736, "step": 131700 }, { "epoch": 1.25690401516868, "grad_norm": 0.12655895948410034, "learning_rate": 0.00019934145435079704, "loss": 2.0474, "num_input_tokens_seen": 69063793760, "step": 131750 }, { "epoch": 1.2573810176848683, "grad_norm": 0.1263783723115921, "learning_rate": 0.0001971049780795901, "loss": 2.0387, "num_input_tokens_seen": 69090002496, "step": 131800 }, { "epoch": 1.2578580202010565, "grad_norm": 0.13202515244483948, "learning_rate": 0.0001948780344812181, "loss": 2.0531, "num_input_tokens_seen": 69116216896, "step": 131850 }, { "epoch": 1.2583350227172447, "grad_norm": 0.12061940133571625, "learning_rate": 0.00019266069364176142, "loss": 2.052, "num_input_tokens_seen": 69142427680, "step": 131900 }, { "epoch": 1.2588120252334332, "grad_norm": 0.1222308874130249, "learning_rate": 0.00019045302534508295, "loss": 2.0409, "num_input_tokens_seen": 69168631136, "step": 131950 }, { "epoch": 1.2592890277496214, "grad_norm": 0.11664976924657822, "learning_rate": 0.00018825509907063325, "loss": 2.0361, "num_input_tokens_seen": 69194840608, "step": 132000 }, { "epoch": 1.2592890277496214, "eval_loss": 1.9602855443954468, "eval_runtime": 82.6066, "eval_samples_per_second": 60.528, "eval_steps_per_second": 15.132, "num_input_tokens_seen": 69194840608, "step": 132000 }, { "epoch": 1.2597660302658096, "grad_norm": 0.11991748213768005, "learning_rate": 0.0001860669839912626, "loss": 2.0354, "num_input_tokens_seen": 69221050496, "step": 132050 }, { "epoch": 1.260243032781998, "grad_norm": 0.11859247088432312, "learning_rate": 0.00018388874897104518, "loss": 2.0449, "num_input_tokens_seen": 69247257536, "step": 132100 }, { "epoch": 1.2607200352981862, "grad_norm": 0.12269642949104309, "learning_rate": 0.00018172046256311088, "loss": 2.0427, "num_input_tokens_seen": 69273469824, "step": 132150 }, { "epoch": 1.2611970378143744, "grad_norm": 0.11893275380134583, "learning_rate": 0.00017956219300748795, "loss": 2.0366, "num_input_tokens_seen": 69299684224, "step": 132200 }, { "epoch": 1.2616740403305626, "grad_norm": 0.12191104143857956, "learning_rate": 0.0001774140082289563, "loss": 2.0393, "num_input_tokens_seen": 69325894496, "step": 132250 }, { "epoch": 1.262151042846751, "grad_norm": 0.12704069912433624, "learning_rate": 0.00017527597583490823, "loss": 2.0551, "num_input_tokens_seen": 69352101952, "step": 132300 }, { "epoch": 1.2626280453629393, "grad_norm": 0.12682849168777466, "learning_rate": 0.00017314816311322218, "loss": 2.0376, "num_input_tokens_seen": 69378314752, "step": 132350 }, { "epoch": 1.2631050478791277, "grad_norm": 0.1246429830789566, "learning_rate": 0.00017103063703014372, "loss": 2.0402, "num_input_tokens_seen": 69404523776, "step": 132400 }, { "epoch": 1.263582050395316, "grad_norm": 0.12006555497646332, "learning_rate": 0.00016892346422817944, "loss": 2.0383, "num_input_tokens_seen": 69430732160, "step": 132450 }, { "epoch": 1.264059052911504, "grad_norm": 0.12435656785964966, "learning_rate": 0.00016682671102399805, "loss": 2.0347, "num_input_tokens_seen": 69456943424, "step": 132500 }, { "epoch": 1.264059052911504, "eval_loss": 1.9590063095092773, "eval_runtime": 82.7888, "eval_samples_per_second": 60.395, "eval_steps_per_second": 15.099, "num_input_tokens_seen": 69456943424, "step": 132500 }, { "epoch": 1.2645360554276923, "grad_norm": 0.12412598729133606, "learning_rate": 0.0001647404434063447, "loss": 2.0436, "num_input_tokens_seen": 69483146688, "step": 132550 }, { "epoch": 1.2650130579438805, "grad_norm": 0.12309623509645462, "learning_rate": 0.00016266472703396284, "loss": 2.028, "num_input_tokens_seen": 69509359968, "step": 132600 }, { "epoch": 1.265490060460069, "grad_norm": 0.12758532166481018, "learning_rate": 0.0001605996272335291, "loss": 2.041, "num_input_tokens_seen": 69535568960, "step": 132650 }, { "epoch": 1.2659670629762572, "grad_norm": 0.11922606080770493, "learning_rate": 0.00015854520899759655, "loss": 2.0308, "num_input_tokens_seen": 69561777024, "step": 132700 }, { "epoch": 1.2664440654924456, "grad_norm": 0.1239946112036705, "learning_rate": 0.00015650153698254916, "loss": 2.0336, "num_input_tokens_seen": 69587981952, "step": 132750 }, { "epoch": 1.2669210680086338, "grad_norm": 0.12584541738033295, "learning_rate": 0.00015446867550656767, "loss": 2.0376, "num_input_tokens_seen": 69614192832, "step": 132800 }, { "epoch": 1.267398070524822, "grad_norm": 0.12514598667621613, "learning_rate": 0.00015244668854760458, "loss": 2.0411, "num_input_tokens_seen": 69640405600, "step": 132850 }, { "epoch": 1.2678750730410102, "grad_norm": 0.12181352823972702, "learning_rate": 0.00015043563974137132, "loss": 2.0404, "num_input_tokens_seen": 69666619040, "step": 132900 }, { "epoch": 1.2683520755571986, "grad_norm": 0.11871461570262909, "learning_rate": 0.00014843559237933475, "loss": 2.0458, "num_input_tokens_seen": 69692833440, "step": 132950 }, { "epoch": 1.2688290780733869, "grad_norm": 0.12271245568990707, "learning_rate": 0.00014644660940672628, "loss": 2.0354, "num_input_tokens_seen": 69719047840, "step": 133000 }, { "epoch": 1.2688290780733869, "eval_loss": 1.9576880931854248, "eval_runtime": 82.558, "eval_samples_per_second": 60.564, "eval_steps_per_second": 15.141, "num_input_tokens_seen": 69719047840, "step": 133000 }, { "epoch": 1.269306080589575, "grad_norm": 0.12358897924423218, "learning_rate": 0.00014446875342055988, "loss": 2.0342, "num_input_tokens_seen": 69745262240, "step": 133050 }, { "epoch": 1.2697830831057635, "grad_norm": 0.12031599134206772, "learning_rate": 0.00014250208666766236, "loss": 2.0402, "num_input_tokens_seen": 69771476640, "step": 133100 }, { "epoch": 1.2702600856219517, "grad_norm": 0.12011140584945679, "learning_rate": 0.00014054667104271496, "loss": 2.0358, "num_input_tokens_seen": 69797691040, "step": 133150 }, { "epoch": 1.27073708813814, "grad_norm": 0.12352379411458969, "learning_rate": 0.00013860256808630427, "loss": 2.043, "num_input_tokens_seen": 69823902816, "step": 133200 }, { "epoch": 1.271214090654328, "grad_norm": 0.1257781833410263, "learning_rate": 0.00013666983898298656, "loss": 2.0464, "num_input_tokens_seen": 69850112224, "step": 133250 }, { "epoch": 1.2716910931705165, "grad_norm": 0.12694838643074036, "learning_rate": 0.00013474854455936125, "loss": 2.0401, "num_input_tokens_seen": 69876325568, "step": 133300 }, { "epoch": 1.2721680956867047, "grad_norm": 0.12634819746017456, "learning_rate": 0.00013283874528215734, "loss": 2.0339, "num_input_tokens_seen": 69902536928, "step": 133350 }, { "epoch": 1.272645098202893, "grad_norm": 0.12307710945606232, "learning_rate": 0.00013094050125632973, "loss": 2.0277, "num_input_tokens_seen": 69928748288, "step": 133400 }, { "epoch": 1.2731221007190814, "grad_norm": 0.12187953293323517, "learning_rate": 0.00012905387222316822, "loss": 2.0402, "num_input_tokens_seen": 69954953888, "step": 133450 }, { "epoch": 1.2735991032352696, "grad_norm": 0.12032655626535416, "learning_rate": 0.0001271789175584172, "loss": 2.0419, "num_input_tokens_seen": 69981165632, "step": 133500 }, { "epoch": 1.2735991032352696, "eval_loss": 1.9568681716918945, "eval_runtime": 82.7406, "eval_samples_per_second": 60.43, "eval_steps_per_second": 15.107, "num_input_tokens_seen": 69981165632, "step": 133500 }, { "epoch": 1.2740761057514578, "grad_norm": 0.12817110121250153, "learning_rate": 0.00012531569627040635, "loss": 2.034, "num_input_tokens_seen": 70007368800, "step": 133550 }, { "epoch": 1.274553108267646, "grad_norm": 0.13095012307167053, "learning_rate": 0.00012346426699819457, "loss": 2.0346, "num_input_tokens_seen": 70033578048, "step": 133600 }, { "epoch": 1.2750301107838344, "grad_norm": 0.12582357227802277, "learning_rate": 0.00012162468800972342, "loss": 2.0398, "num_input_tokens_seen": 70059792448, "step": 133650 }, { "epoch": 1.2755071133000226, "grad_norm": 0.11612017452716827, "learning_rate": 0.00011979701719998454, "loss": 2.0341, "num_input_tokens_seen": 70086003648, "step": 133700 }, { "epoch": 1.2759841158162109, "grad_norm": 0.12256049364805222, "learning_rate": 0.00011798131208919626, "loss": 2.029, "num_input_tokens_seen": 70112204096, "step": 133750 }, { "epoch": 1.2764611183323993, "grad_norm": 0.11747635900974274, "learning_rate": 0.00011617762982099444, "loss": 2.0355, "num_input_tokens_seen": 70138411104, "step": 133800 }, { "epoch": 1.2769381208485875, "grad_norm": 0.12225272506475449, "learning_rate": 0.00011438602716063329, "loss": 2.042, "num_input_tokens_seen": 70164623328, "step": 133850 }, { "epoch": 1.2774151233647757, "grad_norm": 0.1293225735425949, "learning_rate": 0.00011260656049319957, "loss": 2.0367, "num_input_tokens_seen": 70190833888, "step": 133900 }, { "epoch": 1.277892125880964, "grad_norm": 0.12261593341827393, "learning_rate": 0.0001108392858218371, "loss": 2.0444, "num_input_tokens_seen": 70217043648, "step": 133950 }, { "epoch": 1.2783691283971523, "grad_norm": 0.11957214772701263, "learning_rate": 0.0001090842587659851, "loss": 2.0345, "num_input_tokens_seen": 70243253472, "step": 134000 }, { "epoch": 1.2783691283971523, "eval_loss": 1.955412745475769, "eval_runtime": 82.5981, "eval_samples_per_second": 60.534, "eval_steps_per_second": 15.134, "num_input_tokens_seen": 70243253472, "step": 134000 }, { "epoch": 1.2788461309133405, "grad_norm": 0.12490282952785492, "learning_rate": 0.00010734153455962764, "loss": 2.0308, "num_input_tokens_seen": 70269466208, "step": 134050 }, { "epoch": 1.279323133429529, "grad_norm": 0.12396061420440674, "learning_rate": 0.00010561116804955451, "loss": 2.036, "num_input_tokens_seen": 70295676096, "step": 134100 }, { "epoch": 1.2798001359457172, "grad_norm": 0.12122515588998795, "learning_rate": 0.00010389321369363636, "loss": 2.0424, "num_input_tokens_seen": 70321882272, "step": 134150 }, { "epoch": 1.2802771384619054, "grad_norm": 0.12559206783771515, "learning_rate": 0.00010218772555910954, "loss": 2.0456, "num_input_tokens_seen": 70348095808, "step": 134200 }, { "epoch": 1.2807541409780936, "grad_norm": 0.11915505677461624, "learning_rate": 0.0001004947573208756, "loss": 2.0412, "num_input_tokens_seen": 70374304800, "step": 134250 }, { "epoch": 1.2812311434942818, "grad_norm": 0.12196268141269684, "learning_rate": 9.881436225981105e-05, "loss": 2.0386, "num_input_tokens_seen": 70400510976, "step": 134300 }, { "epoch": 1.2817081460104702, "grad_norm": 0.12415535002946854, "learning_rate": 9.714659326109137e-05, "loss": 2.0448, "num_input_tokens_seen": 70426725376, "step": 134350 }, { "epoch": 1.2821851485266584, "grad_norm": 0.12361661344766617, "learning_rate": 9.549150281252633e-05, "loss": 2.0371, "num_input_tokens_seen": 70452929792, "step": 134400 }, { "epoch": 1.2826621510428469, "grad_norm": 0.12377167493104935, "learning_rate": 9.384914300290748e-05, "loss": 2.0344, "num_input_tokens_seen": 70479144192, "step": 134450 }, { "epoch": 1.283139153559035, "grad_norm": 0.11863281577825546, "learning_rate": 9.221956552036992e-05, "loss": 2.0393, "num_input_tokens_seen": 70505353504, "step": 134500 }, { "epoch": 1.283139153559035, "eval_loss": 1.9545812606811523, "eval_runtime": 82.3767, "eval_samples_per_second": 60.697, "eval_steps_per_second": 15.174, "num_input_tokens_seen": 70505353504, "step": 134500 }, { "epoch": 1.2836161560752233, "grad_norm": 0.12550202012062073, "learning_rate": 9.060282165076461e-05, "loss": 2.0483, "num_input_tokens_seen": 70531564640, "step": 134550 }, { "epoch": 1.2840931585914115, "grad_norm": 0.12165137380361557, "learning_rate": 8.899896227604509e-05, "loss": 2.034, "num_input_tokens_seen": 70557777824, "step": 134600 }, { "epoch": 1.2845701611076, "grad_norm": 0.12417840212583542, "learning_rate": 8.740803787266521e-05, "loss": 2.0381, "num_input_tokens_seen": 70583987456, "step": 134650 }, { "epoch": 1.2850471636237881, "grad_norm": 0.12609820067882538, "learning_rate": 8.58300985099918e-05, "loss": 2.0369, "num_input_tokens_seen": 70610189152, "step": 134700 }, { "epoch": 1.2855241661399763, "grad_norm": 0.1163376122713089, "learning_rate": 8.426519384872733e-05, "loss": 2.0236, "num_input_tokens_seen": 70636401088, "step": 134750 }, { "epoch": 1.2860011686561648, "grad_norm": 0.11958843469619751, "learning_rate": 8.271337313934868e-05, "loss": 2.0465, "num_input_tokens_seen": 70662608672, "step": 134800 }, { "epoch": 1.286478171172353, "grad_norm": 0.12234240025281906, "learning_rate": 8.117468522055577e-05, "loss": 2.0384, "num_input_tokens_seen": 70688820640, "step": 134850 }, { "epoch": 1.2869551736885412, "grad_norm": 0.11501733213663101, "learning_rate": 7.964917851773496e-05, "loss": 2.0343, "num_input_tokens_seen": 70715035040, "step": 134900 }, { "epoch": 1.2874321762047294, "grad_norm": 0.12062328308820724, "learning_rate": 7.813690104143555e-05, "loss": 2.0211, "num_input_tokens_seen": 70741249088, "step": 134950 }, { "epoch": 1.2879091787209178, "grad_norm": 0.11405592411756516, "learning_rate": 7.663790038585794e-05, "loss": 2.0401, "num_input_tokens_seen": 70767457344, "step": 135000 }, { "epoch": 1.2879091787209178, "eval_loss": 1.9541493654251099, "eval_runtime": 82.5619, "eval_samples_per_second": 60.561, "eval_steps_per_second": 15.14, "num_input_tokens_seen": 70767457344, "step": 135000 }, { "epoch": 1.288386181237106, "grad_norm": 0.1237749382853508, "learning_rate": 7.515222372735647e-05, "loss": 2.029, "num_input_tokens_seen": 70793671744, "step": 135050 }, { "epoch": 1.2888631837532942, "grad_norm": 0.11638092249631882, "learning_rate": 7.367991782295391e-05, "loss": 2.0171, "num_input_tokens_seen": 70819879168, "step": 135100 }, { "epoch": 1.2893401862694827, "grad_norm": 0.11938998103141785, "learning_rate": 7.222102900887101e-05, "loss": 2.0232, "num_input_tokens_seen": 70846079616, "step": 135150 }, { "epoch": 1.2898171887856709, "grad_norm": 0.11985292285680771, "learning_rate": 7.077560319906695e-05, "loss": 2.0387, "num_input_tokens_seen": 70872294016, "step": 135200 }, { "epoch": 1.290294191301859, "grad_norm": 0.12651756405830383, "learning_rate": 6.934368588379552e-05, "loss": 2.0345, "num_input_tokens_seen": 70898498624, "step": 135250 }, { "epoch": 1.2907711938180473, "grad_norm": 0.12012086063623428, "learning_rate": 6.792532212817271e-05, "loss": 2.0362, "num_input_tokens_seen": 70924710048, "step": 135300 }, { "epoch": 1.2912481963342357, "grad_norm": 0.12295469641685486, "learning_rate": 6.652055657075845e-05, "loss": 2.0338, "num_input_tokens_seen": 70950915200, "step": 135350 }, { "epoch": 1.291725198850424, "grad_norm": 0.12192966043949127, "learning_rate": 6.512943342215233e-05, "loss": 2.0311, "num_input_tokens_seen": 70977118208, "step": 135400 }, { "epoch": 1.2922022013666123, "grad_norm": 0.1188386008143425, "learning_rate": 6.375199646360142e-05, "loss": 2.0311, "num_input_tokens_seen": 71003331520, "step": 135450 }, { "epoch": 1.2926792038828006, "grad_norm": 0.11646123230457306, "learning_rate": 6.238828904562316e-05, "loss": 2.037, "num_input_tokens_seen": 71029545920, "step": 135500 }, { "epoch": 1.2926792038828006, "eval_loss": 1.9530843496322632, "eval_runtime": 82.2362, "eval_samples_per_second": 60.8, "eval_steps_per_second": 15.2, "num_input_tokens_seen": 71029545920, "step": 135500 }, { "epoch": 1.2931562063989888, "grad_norm": 0.12359626591205597, "learning_rate": 6.103835408664032e-05, "loss": 2.0441, "num_input_tokens_seen": 71055753312, "step": 135550 }, { "epoch": 1.293633208915177, "grad_norm": 0.12097882479429245, "learning_rate": 5.9702234071631e-05, "loss": 2.0251, "num_input_tokens_seen": 71081964480, "step": 135600 }, { "epoch": 1.2941102114313652, "grad_norm": 0.11585067212581635, "learning_rate": 5.83799710507909e-05, "loss": 2.0352, "num_input_tokens_seen": 71108163424, "step": 135650 }, { "epoch": 1.2945872139475536, "grad_norm": 0.12164249271154404, "learning_rate": 5.7071606638210094e-05, "loss": 2.0314, "num_input_tokens_seen": 71134375424, "step": 135700 }, { "epoch": 1.2950642164637418, "grad_norm": 0.11601755023002625, "learning_rate": 5.577718201056392e-05, "loss": 2.0313, "num_input_tokens_seen": 71160582688, "step": 135750 }, { "epoch": 1.2955412189799302, "grad_norm": 0.11863810569047928, "learning_rate": 5.449673790581611e-05, "loss": 2.036, "num_input_tokens_seen": 71186792800, "step": 135800 }, { "epoch": 1.2960182214961184, "grad_norm": 0.12455905973911285, "learning_rate": 5.3230314621937556e-05, "loss": 2.0316, "num_input_tokens_seen": 71213000416, "step": 135850 }, { "epoch": 1.2964952240123067, "grad_norm": 0.11861378699541092, "learning_rate": 5.197795201563743e-05, "loss": 2.0334, "num_input_tokens_seen": 71239212224, "step": 135900 }, { "epoch": 1.2969722265284949, "grad_norm": 0.11894825845956802, "learning_rate": 5.073968950110941e-05, "loss": 2.028, "num_input_tokens_seen": 71265425728, "step": 135950 }, { "epoch": 1.297449229044683, "grad_norm": 0.11746333539485931, "learning_rate": 4.9515566048790485e-05, "loss": 2.0302, "num_input_tokens_seen": 71291638272, "step": 136000 }, { "epoch": 1.297449229044683, "eval_loss": 1.9527229070663452, "eval_runtime": 82.9319, "eval_samples_per_second": 60.29, "eval_steps_per_second": 15.073, "num_input_tokens_seen": 71291638272, "step": 136000 }, { "epoch": 1.2979262315608715, "grad_norm": 0.1190498098731041, "learning_rate": 4.8305620184135315e-05, "loss": 2.0321, "num_input_tokens_seen": 71317844512, "step": 136050 }, { "epoch": 1.2984032340770597, "grad_norm": 0.11770997196435928, "learning_rate": 4.7109889986402973e-05, "loss": 2.0341, "num_input_tokens_seen": 71344050560, "step": 136100 }, { "epoch": 1.2988802365932481, "grad_norm": 0.11683844774961472, "learning_rate": 4.592841308745932e-05, "loss": 2.0243, "num_input_tokens_seen": 71370258656, "step": 136150 }, { "epoch": 1.2993572391094363, "grad_norm": 0.12114414572715759, "learning_rate": 4.476122667059207e-05, "loss": 2.0379, "num_input_tokens_seen": 71396470656, "step": 136200 }, { "epoch": 1.2998342416256246, "grad_norm": 0.11975762993097305, "learning_rate": 4.3608367469340547e-05, "loss": 2.0359, "num_input_tokens_seen": 71422685056, "step": 136250 }, { "epoch": 1.3003112441418128, "grad_norm": 0.11278797686100006, "learning_rate": 4.2469871766340095e-05, "loss": 2.0219, "num_input_tokens_seen": 71448892928, "step": 136300 }, { "epoch": 1.3007882466580012, "grad_norm": 0.11854268610477448, "learning_rate": 4.1345775392179654e-05, "loss": 2.0404, "num_input_tokens_seen": 71475094528, "step": 136350 }, { "epoch": 1.3012652491741894, "grad_norm": 0.11631016433238983, "learning_rate": 4.0236113724274713e-05, "loss": 2.0301, "num_input_tokens_seen": 71501303968, "step": 136400 }, { "epoch": 1.3017422516903776, "grad_norm": 0.11170602589845657, "learning_rate": 3.9140921685753064e-05, "loss": 2.0431, "num_input_tokens_seen": 71527518368, "step": 136450 }, { "epoch": 1.302219254206566, "grad_norm": 0.11311063915491104, "learning_rate": 3.806023374435663e-05, "loss": 2.0173, "num_input_tokens_seen": 71553726688, "step": 136500 }, { "epoch": 1.302219254206566, "eval_loss": 1.9524949789047241, "eval_runtime": 83.0874, "eval_samples_per_second": 60.178, "eval_steps_per_second": 15.044, "num_input_tokens_seen": 71553726688, "step": 136500 }, { "epoch": 1.3026962567227542, "grad_norm": 0.728589653968811, "learning_rate": 3.699408391135611e-05, "loss": 2.0415, "num_input_tokens_seen": 71579934304, "step": 136550 }, { "epoch": 1.3031732592389424, "grad_norm": 0.11253057420253754, "learning_rate": 3.594250574048058e-05, "loss": 2.0334, "num_input_tokens_seen": 71606145184, "step": 136600 }, { "epoch": 1.3036502617551307, "grad_norm": 0.12201691418886185, "learning_rate": 3.4905532326861944e-05, "loss": 2.0403, "num_input_tokens_seen": 71632351648, "step": 136650 }, { "epoch": 1.304127264271319, "grad_norm": 0.11976749449968338, "learning_rate": 3.3883196305992905e-05, "loss": 2.0292, "num_input_tokens_seen": 71658566048, "step": 136700 }, { "epoch": 1.3046042667875073, "grad_norm": 0.12131944298744202, "learning_rate": 3.2875529852700146e-05, "loss": 2.0405, "num_input_tokens_seen": 71684775808, "step": 136750 }, { "epoch": 1.3050812693036955, "grad_norm": 0.11625051498413086, "learning_rate": 3.18825646801314e-05, "loss": 2.0392, "num_input_tokens_seen": 71710990048, "step": 136800 }, { "epoch": 1.305558271819884, "grad_norm": 0.11870067566633224, "learning_rate": 3.0904332038757974e-05, "loss": 2.0388, "num_input_tokens_seen": 71737198176, "step": 136850 }, { "epoch": 1.3060352743360721, "grad_norm": 0.11490604281425476, "learning_rate": 2.994086271539048e-05, "loss": 2.0261, "num_input_tokens_seen": 71763409248, "step": 136900 }, { "epoch": 1.3065122768522603, "grad_norm": 0.1218944787979126, "learning_rate": 2.8992187032210516e-05, "loss": 2.0421, "num_input_tokens_seen": 71789610880, "step": 136950 }, { "epoch": 1.3069892793684486, "grad_norm": 0.11681609600782394, "learning_rate": 2.8058334845816213e-05, "loss": 2.0287, "num_input_tokens_seen": 71815816608, "step": 137000 }, { "epoch": 1.3069892793684486, "eval_loss": 1.951898455619812, "eval_runtime": 82.7779, "eval_samples_per_second": 60.403, "eval_steps_per_second": 15.101, "num_input_tokens_seen": 71815816608, "step": 137000 }, { "epoch": 1.307466281884637, "grad_norm": 0.11646866798400879, "learning_rate": 2.7139335546282283e-05, "loss": 2.0325, "num_input_tokens_seen": 71842030368, "step": 137050 }, { "epoch": 1.3079432844008252, "grad_norm": 0.10989837348461151, "learning_rate": 2.6235218056235634e-05, "loss": 2.0325, "num_input_tokens_seen": 71868244768, "step": 137100 }, { "epoch": 1.3084202869170136, "grad_norm": 0.11658209562301636, "learning_rate": 2.5346010829944367e-05, "loss": 2.0289, "num_input_tokens_seen": 71894452160, "step": 137150 }, { "epoch": 1.3088972894332018, "grad_norm": 0.11487242579460144, "learning_rate": 2.4471741852423235e-05, "loss": 2.0322, "num_input_tokens_seen": 71920664928, "step": 137200 }, { "epoch": 1.30937429194939, "grad_norm": 0.11544458568096161, "learning_rate": 2.3612438638551835e-05, "loss": 2.0279, "num_input_tokens_seen": 71946876896, "step": 137250 }, { "epoch": 1.3098512944655782, "grad_norm": 0.11500503867864609, "learning_rate": 2.276812823220964e-05, "loss": 2.0399, "num_input_tokens_seen": 71973091200, "step": 137300 }, { "epoch": 1.3103282969817664, "grad_norm": 0.11575910449028015, "learning_rate": 2.1938837205424e-05, "loss": 2.0246, "num_input_tokens_seen": 71999300832, "step": 137350 }, { "epoch": 1.3108052994979549, "grad_norm": 0.1175985336303711, "learning_rate": 2.1124591657534777e-05, "loss": 2.0225, "num_input_tokens_seen": 72025515232, "step": 137400 }, { "epoch": 1.311282302014143, "grad_norm": 0.11688115447759628, "learning_rate": 2.032541721437209e-05, "loss": 2.024, "num_input_tokens_seen": 72051723040, "step": 137450 }, { "epoch": 1.3117593045303315, "grad_norm": 0.11419174075126648, "learning_rate": 1.9541339027450256e-05, "loss": 2.0254, "num_input_tokens_seen": 72077935168, "step": 137500 }, { "epoch": 1.3117593045303315, "eval_loss": 1.951472282409668, "eval_runtime": 83.1149, "eval_samples_per_second": 60.158, "eval_steps_per_second": 15.039, "num_input_tokens_seen": 72077935168, "step": 137500 }, { "epoch": 1.3122363070465197, "grad_norm": 0.11731937527656555, "learning_rate": 1.8772381773176416e-05, "loss": 2.0368, "num_input_tokens_seen": 72104145664, "step": 137550 }, { "epoch": 1.312713309562708, "grad_norm": 0.11281976848840714, "learning_rate": 1.801856965207338e-05, "loss": 2.0243, "num_input_tokens_seen": 72130351488, "step": 137600 }, { "epoch": 1.3131903120788961, "grad_norm": 0.12566816806793213, "learning_rate": 1.7279926388018564e-05, "loss": 2.0266, "num_input_tokens_seen": 72156564000, "step": 137650 }, { "epoch": 1.3136673145950846, "grad_norm": 0.1202327162027359, "learning_rate": 1.6556475227496815e-05, "loss": 2.0344, "num_input_tokens_seen": 72182768800, "step": 137700 }, { "epoch": 1.3141443171112728, "grad_norm": 0.11209400743246078, "learning_rate": 1.584823893886933e-05, "loss": 2.0307, "num_input_tokens_seen": 72208977472, "step": 137750 }, { "epoch": 1.314621319627461, "grad_norm": 0.11281031370162964, "learning_rate": 1.5155239811656562e-05, "loss": 2.0285, "num_input_tokens_seen": 72235186752, "step": 137800 }, { "epoch": 1.3150983221436494, "grad_norm": 0.11977609992027283, "learning_rate": 1.4477499655837278e-05, "loss": 2.0307, "num_input_tokens_seen": 72261390432, "step": 137850 }, { "epoch": 1.3155753246598376, "grad_norm": 0.11602313071489334, "learning_rate": 1.3815039801161721e-05, "loss": 2.0272, "num_input_tokens_seen": 72287596960, "step": 137900 }, { "epoch": 1.3160523271760258, "grad_norm": 0.11629103124141693, "learning_rate": 1.3167881096480372e-05, "loss": 2.0426, "num_input_tokens_seen": 72313806912, "step": 137950 }, { "epoch": 1.316529329692214, "grad_norm": 0.11337430030107498, "learning_rate": 1.2536043909088191e-05, "loss": 2.0286, "num_input_tokens_seen": 72340003200, "step": 138000 }, { "epoch": 1.316529329692214, "eval_loss": 1.9512444734573364, "eval_runtime": 82.1325, "eval_samples_per_second": 60.877, "eval_steps_per_second": 15.219, "num_input_tokens_seen": 72340003200, "step": 138000 }, { "epoch": 1.3170063322084025, "grad_norm": 0.11734651029109955, "learning_rate": 1.191954812408308e-05, "loss": 2.0241, "num_input_tokens_seen": 72366217600, "step": 138050 }, { "epoch": 1.3174833347245907, "grad_norm": 0.11315104365348816, "learning_rate": 1.1318413143740436e-05, "loss": 2.0195, "num_input_tokens_seen": 72392425632, "step": 138100 }, { "epoch": 1.3179603372407789, "grad_norm": 0.11212780326604843, "learning_rate": 1.0732657886902309e-05, "loss": 2.0379, "num_input_tokens_seen": 72418637536, "step": 138150 }, { "epoch": 1.3184373397569673, "grad_norm": 0.11390957236289978, "learning_rate": 1.0162300788382261e-05, "loss": 2.0245, "num_input_tokens_seen": 72444850752, "step": 138200 }, { "epoch": 1.3189143422731555, "grad_norm": 0.11521212011575699, "learning_rate": 9.607359798384786e-06, "loss": 2.0313, "num_input_tokens_seen": 72471060032, "step": 138250 }, { "epoch": 1.3193913447893437, "grad_norm": 0.11375854164361954, "learning_rate": 9.0678523819408e-06, "loss": 2.0313, "num_input_tokens_seen": 72497274432, "step": 138300 }, { "epoch": 1.319868347305532, "grad_norm": 0.11399056017398834, "learning_rate": 8.543795518357766e-06, "loss": 2.0256, "num_input_tokens_seen": 72523485952, "step": 138350 }, { "epoch": 1.3203453498217204, "grad_norm": 0.11128194630146027, "learning_rate": 8.035205700685167e-06, "loss": 2.0338, "num_input_tokens_seen": 72549700352, "step": 138400 }, { "epoch": 1.3208223523379086, "grad_norm": 0.11179857701063156, "learning_rate": 7.542098935195918e-06, "loss": 2.0362, "num_input_tokens_seen": 72575912992, "step": 138450 }, { "epoch": 1.3212993548540968, "grad_norm": 0.11500924825668335, "learning_rate": 7.064490740882057e-06, "loss": 2.0285, "num_input_tokens_seen": 72602127392, "step": 138500 }, { "epoch": 1.3212993548540968, "eval_loss": 1.951123833656311, "eval_runtime": 82.6672, "eval_samples_per_second": 60.484, "eval_steps_per_second": 15.121, "num_input_tokens_seen": 72602127392, "step": 138500 }, { "epoch": 1.3217763573702852, "grad_norm": 0.1176285520195961, "learning_rate": 6.602396148966794e-06, "loss": 2.0295, "num_input_tokens_seen": 72628340704, "step": 138550 }, { "epoch": 1.3222533598864734, "grad_norm": 0.11359469592571259, "learning_rate": 6.15582970243117e-06, "loss": 2.0206, "num_input_tokens_seen": 72654548704, "step": 138600 }, { "epoch": 1.3227303624026616, "grad_norm": 0.11230379343032837, "learning_rate": 5.72480545555637e-06, "loss": 2.0285, "num_input_tokens_seen": 72680760704, "step": 138650 }, { "epoch": 1.3232073649188498, "grad_norm": 0.11325126886367798, "learning_rate": 5.309336973481682e-06, "loss": 2.0316, "num_input_tokens_seen": 72706975104, "step": 138700 }, { "epoch": 1.3236843674350383, "grad_norm": 0.11530512571334839, "learning_rate": 4.909437331777178e-06, "loss": 2.0295, "num_input_tokens_seen": 72733189504, "step": 138750 }, { "epoch": 1.3241613699512265, "grad_norm": 0.11637042462825775, "learning_rate": 4.52511911603265e-06, "loss": 2.0358, "num_input_tokens_seen": 72759403904, "step": 138800 }, { "epoch": 1.324638372467415, "grad_norm": 0.11307495832443237, "learning_rate": 4.15639442146093e-06, "loss": 2.0256, "num_input_tokens_seen": 72785609280, "step": 138850 }, { "epoch": 1.325115374983603, "grad_norm": 0.11408944427967072, "learning_rate": 3.803274852517968e-06, "loss": 2.0432, "num_input_tokens_seen": 72811823680, "step": 138900 }, { "epoch": 1.3255923774997913, "grad_norm": 0.11304306238889694, "learning_rate": 3.4657715225368535e-06, "loss": 2.0342, "num_input_tokens_seen": 72838035008, "step": 138950 }, { "epoch": 1.3260693800159795, "grad_norm": 0.11682960391044617, "learning_rate": 3.143895053378698e-06, "loss": 2.0353, "num_input_tokens_seen": 72864248896, "step": 139000 }, { "epoch": 1.3260693800159795, "eval_loss": 1.9510550498962402, "eval_runtime": 82.5623, "eval_samples_per_second": 60.56, "eval_steps_per_second": 15.14, "num_input_tokens_seen": 72864248896, "step": 139000 }, { "epoch": 1.3265463825321677, "grad_norm": 0.11243559420108795, "learning_rate": 2.837655575097964e-06, "loss": 2.0318, "num_input_tokens_seen": 72890458688, "step": 139050 }, { "epoch": 1.3270233850483562, "grad_norm": 0.11617834120988846, "learning_rate": 2.547062725623828e-06, "loss": 2.0384, "num_input_tokens_seen": 72916673088, "step": 139100 }, { "epoch": 1.3275003875645444, "grad_norm": 0.11737903952598572, "learning_rate": 2.2721256504567023e-06, "loss": 2.0235, "num_input_tokens_seen": 72942884768, "step": 139150 }, { "epoch": 1.3279773900807328, "grad_norm": 0.10866422206163406, "learning_rate": 2.012853002380466e-06, "loss": 2.024, "num_input_tokens_seen": 72969088544, "step": 139200 }, { "epoch": 1.328454392596921, "grad_norm": 0.11547800898551941, "learning_rate": 1.769252941190458e-06, "loss": 2.0323, "num_input_tokens_seen": 72995301472, "step": 139250 }, { "epoch": 1.3289313951131092, "grad_norm": 0.11617856472730637, "learning_rate": 1.541333133436018e-06, "loss": 2.0294, "num_input_tokens_seen": 73021507392, "step": 139300 }, { "epoch": 1.3294083976292974, "grad_norm": 0.11435816437005997, "learning_rate": 1.3291007521799014e-06, "loss": 2.0288, "num_input_tokens_seen": 73047719968, "step": 139350 }, { "epoch": 1.3298854001454858, "grad_norm": 0.11262206733226776, "learning_rate": 1.132562476771959e-06, "loss": 2.0301, "num_input_tokens_seen": 73073924576, "step": 139400 }, { "epoch": 1.330362402661674, "grad_norm": 0.11383078992366791, "learning_rate": 9.517244926393609e-07, "loss": 2.0187, "num_input_tokens_seen": 73100138976, "step": 139450 }, { "epoch": 1.3308394051778623, "grad_norm": 0.1159028634428978, "learning_rate": 7.865924910916978e-07, "loss": 2.0366, "num_input_tokens_seen": 73126349984, "step": 139500 }, { "epoch": 1.3308394051778623, "eval_loss": 1.9510103464126587, "eval_runtime": 82.8489, "eval_samples_per_second": 60.351, "eval_steps_per_second": 15.088, "num_input_tokens_seen": 73126349984, "step": 139500 }, { "epoch": 1.3313164076940507, "grad_norm": 0.1160767450928688, "learning_rate": 6.371716691419005e-07, "loss": 2.0374, "num_input_tokens_seen": 73152559296, "step": 139550 }, { "epoch": 1.331793410210239, "grad_norm": 0.11154640465974808, "learning_rate": 5.034667293427053e-07, "loss": 2.0385, "num_input_tokens_seen": 73178773696, "step": 139600 }, { "epoch": 1.332270412726427, "grad_norm": 0.11127237975597382, "learning_rate": 3.854818796385495e-07, "loss": 2.0281, "num_input_tokens_seen": 73204985664, "step": 139650 }, { "epoch": 1.3327474152426153, "grad_norm": 0.11270651966333389, "learning_rate": 2.8322083323334415e-07, "loss": 2.022, "num_input_tokens_seen": 73231192992, "step": 139700 }, { "epoch": 1.3332244177588037, "grad_norm": 0.11388963460922241, "learning_rate": 1.9668680847356734e-07, "loss": 2.0305, "num_input_tokens_seen": 73257397792, "step": 139750 }, { "epoch": 1.333701420274992, "grad_norm": 0.11808367073535919, "learning_rate": 1.2588252874673466e-07, "loss": 2.0302, "num_input_tokens_seen": 73283607648, "step": 139800 }, { "epoch": 1.3341784227911802, "grad_norm": 0.11369805783033371, "learning_rate": 7.081022239591173e-08, "loss": 2.0355, "num_input_tokens_seen": 73309822048, "step": 139850 }, { "epoch": 1.3346554253073686, "grad_norm": 0.11115424335002899, "learning_rate": 3.147162264971471e-08, "loss": 2.027, "num_input_tokens_seen": 73336032384, "step": 139900 }, { "epoch": 1.3351324278235568, "grad_norm": 0.11730392277240753, "learning_rate": 7.867967567354306e-09, "loss": 2.0268, "num_input_tokens_seen": 73362242112, "step": 139950 }, { "epoch": 1.335609430339745, "grad_norm": 0.11209023743867874, "learning_rate": 0.0, "loss": 2.0315, "num_input_tokens_seen": 73388446624, "step": 140000 }, { "epoch": 1.335609430339745, "eval_loss": 1.9509990215301514, "eval_runtime": 82.6099, "eval_samples_per_second": 60.525, "eval_steps_per_second": 15.131, "num_input_tokens_seen": 73388446624, "step": 140000 }, { "epoch": 3.076397950841334, "grad_norm": 0.09252593666315079, "learning_rate": 0.0001, "loss": 2.3535, "num_input_tokens_seen": 73440875424, "step": 140050 }, { "epoch": 3.0774962537573973, "grad_norm": 0.08520153909921646, "learning_rate": 0.0001, "loss": 2.3529, "num_input_tokens_seen": 73493304224, "step": 140100 }, { "epoch": 3.07859455667346, "grad_norm": 0.09475487470626831, "learning_rate": 0.0001, "loss": 2.3539, "num_input_tokens_seen": 73545729952, "step": 140150 }, { "epoch": 3.079692859589523, "grad_norm": 0.08525670319795609, "learning_rate": 0.0001, "loss": 2.3603, "num_input_tokens_seen": 73598155232, "step": 140200 }, { "epoch": 3.080791162505586, "grad_norm": 0.09414695203304291, "learning_rate": 0.0001, "loss": 2.3596, "num_input_tokens_seen": 73650584032, "step": 140250 }, { "epoch": 3.0818894654216487, "grad_norm": 0.08829599618911743, "learning_rate": 0.0001, "loss": 2.3582, "num_input_tokens_seen": 73703009408, "step": 140300 }, { "epoch": 3.082987768337712, "grad_norm": 0.08346480131149292, "learning_rate": 0.0001, "loss": 2.3473, "num_input_tokens_seen": 73755435104, "step": 140350 }, { "epoch": 3.0840860712537745, "grad_norm": 0.09302923828363419, "learning_rate": 0.0001, "loss": 2.3555, "num_input_tokens_seen": 73807860000, "step": 140400 }, { "epoch": 3.0851843741698373, "grad_norm": 0.08695721626281738, "learning_rate": 0.0001, "loss": 2.3578, "num_input_tokens_seen": 73860288800, "step": 140450 }, { "epoch": 3.0862826770859004, "grad_norm": 0.09424284100532532, "learning_rate": 0.0001, "loss": 2.3523, "num_input_tokens_seen": 73912717600, "step": 140500 }, { "epoch": 3.0862826770859004, "eval_loss": 2.2698493003845215, "eval_runtime": 81.2331, "eval_samples_per_second": 61.551, "eval_steps_per_second": 15.388, "num_input_tokens_seen": 73912717600, "step": 140500 }, { "epoch": 3.087380980001963, "grad_norm": 0.08606674522161484, "learning_rate": 0.0001, "loss": 2.3589, "num_input_tokens_seen": 73965145984, "step": 140550 }, { "epoch": 3.0884792829180263, "grad_norm": 0.09220123291015625, "learning_rate": 0.0001, "loss": 2.3503, "num_input_tokens_seen": 74017574784, "step": 140600 }, { "epoch": 3.089577585834089, "grad_norm": 0.10021138191223145, "learning_rate": 0.0001, "loss": 2.3528, "num_input_tokens_seen": 74070003040, "step": 140650 }, { "epoch": 3.0906758887501518, "grad_norm": 0.08400563895702362, "learning_rate": 0.0001, "loss": 2.3575, "num_input_tokens_seen": 74122431840, "step": 140700 }, { "epoch": 3.091774191666215, "grad_norm": 0.08861430734395981, "learning_rate": 0.0001, "loss": 2.3552, "num_input_tokens_seen": 74174859680, "step": 140750 }, { "epoch": 3.0928724945822776, "grad_norm": 0.08466708660125732, "learning_rate": 0.0001, "loss": 2.3603, "num_input_tokens_seen": 74227284768, "step": 140800 }, { "epoch": 3.0939707974983404, "grad_norm": 0.08707701414823532, "learning_rate": 0.0001, "loss": 2.3595, "num_input_tokens_seen": 74279711840, "step": 140850 }, { "epoch": 3.0950691004144035, "grad_norm": 0.08657340705394745, "learning_rate": 0.0001, "loss": 2.3511, "num_input_tokens_seen": 74332140640, "step": 140900 }, { "epoch": 3.0961674033304663, "grad_norm": 0.08521311730146408, "learning_rate": 0.0001, "loss": 2.3569, "num_input_tokens_seen": 74384569440, "step": 140950 }, { "epoch": 3.097265706246529, "grad_norm": 0.08738870918750763, "learning_rate": 0.0001, "loss": 2.3587, "num_input_tokens_seen": 74436998240, "step": 141000 }, { "epoch": 3.097265706246529, "eval_loss": 2.269127607345581, "eval_runtime": 80.825, "eval_samples_per_second": 61.862, "eval_steps_per_second": 15.466, "num_input_tokens_seen": 74436998240, "step": 141000 }, { "epoch": 3.098364009162592, "grad_norm": 0.08698341250419617, "learning_rate": 0.0001, "loss": 2.3601, "num_input_tokens_seen": 74489427040, "step": 141050 }, { "epoch": 3.099462312078655, "grad_norm": 0.08720843493938446, "learning_rate": 0.0001, "loss": 2.3535, "num_input_tokens_seen": 74541855840, "step": 141100 }, { "epoch": 3.100560614994718, "grad_norm": 0.09104479104280472, "learning_rate": 0.0001, "loss": 2.3593, "num_input_tokens_seen": 74594280736, "step": 141150 }, { "epoch": 3.1016589179107807, "grad_norm": 0.08937595039606094, "learning_rate": 0.0001, "loss": 2.3598, "num_input_tokens_seen": 74646709536, "step": 141200 }, { "epoch": 3.1027572208268435, "grad_norm": 0.09215135872364044, "learning_rate": 0.0001, "loss": 2.3554, "num_input_tokens_seen": 74699138336, "step": 141250 }, { "epoch": 3.1038555237429066, "grad_norm": 0.09628193080425262, "learning_rate": 0.0001, "loss": 2.3533, "num_input_tokens_seen": 74751567136, "step": 141300 }, { "epoch": 3.1049538266589694, "grad_norm": 0.08902380615472794, "learning_rate": 0.0001, "loss": 2.3624, "num_input_tokens_seen": 74803995936, "step": 141350 }, { "epoch": 3.106052129575032, "grad_norm": 0.08567160367965698, "learning_rate": 0.0001, "loss": 2.3584, "num_input_tokens_seen": 74856421632, "step": 141400 }, { "epoch": 3.1071504324910952, "grad_norm": 0.08796554803848267, "learning_rate": 0.0001, "loss": 2.354, "num_input_tokens_seen": 74908850432, "step": 141450 }, { "epoch": 3.108248735407158, "grad_norm": 0.0918358638882637, "learning_rate": 0.0001, "loss": 2.3482, "num_input_tokens_seen": 74961276896, "step": 141500 }, { "epoch": 3.108248735407158, "eval_loss": 2.2687478065490723, "eval_runtime": 81.2772, "eval_samples_per_second": 61.518, "eval_steps_per_second": 15.379, "num_input_tokens_seen": 74961276896, "step": 141500 }, { "epoch": 3.109347038323221, "grad_norm": 0.09383154660463333, "learning_rate": 0.0001, "loss": 2.3568, "num_input_tokens_seen": 75013705696, "step": 141550 }, { "epoch": 3.110445341239284, "grad_norm": 0.08789625018835068, "learning_rate": 0.0001, "loss": 2.3525, "num_input_tokens_seen": 75066134496, "step": 141600 }, { "epoch": 3.1115436441553466, "grad_norm": 0.08712870627641678, "learning_rate": 0.0001, "loss": 2.3552, "num_input_tokens_seen": 75118563264, "step": 141650 }, { "epoch": 3.1126419470714097, "grad_norm": 0.09325970709323883, "learning_rate": 0.0001, "loss": 2.3563, "num_input_tokens_seen": 75170986784, "step": 141700 }, { "epoch": 3.1137402499874725, "grad_norm": 0.08622899651527405, "learning_rate": 0.0001, "loss": 2.3492, "num_input_tokens_seen": 75223415584, "step": 141750 }, { "epoch": 3.114838552903535, "grad_norm": 0.08862034976482391, "learning_rate": 0.0001, "loss": 2.3497, "num_input_tokens_seen": 75275844384, "step": 141800 }, { "epoch": 3.1159368558195983, "grad_norm": 0.09716533124446869, "learning_rate": 0.0001, "loss": 2.3552, "num_input_tokens_seen": 75328273184, "step": 141850 }, { "epoch": 3.117035158735661, "grad_norm": 0.08546283841133118, "learning_rate": 0.0001, "loss": 2.3493, "num_input_tokens_seen": 75380701984, "step": 141900 }, { "epoch": 3.1181334616517242, "grad_norm": 0.0886145830154419, "learning_rate": 0.0001, "loss": 2.3508, "num_input_tokens_seen": 75433130784, "step": 141950 }, { "epoch": 3.119231764567787, "grad_norm": 0.09431352466344833, "learning_rate": 0.0001, "loss": 2.3529, "num_input_tokens_seen": 75485554848, "step": 142000 }, { "epoch": 3.119231764567787, "eval_loss": 2.2686169147491455, "eval_runtime": 80.9243, "eval_samples_per_second": 61.786, "eval_steps_per_second": 15.447, "num_input_tokens_seen": 75485554848, "step": 142000 }, { "epoch": 3.1203300674838497, "grad_norm": 0.09836099296808243, "learning_rate": 0.0001, "loss": 2.3527, "num_input_tokens_seen": 75537983648, "step": 142050 }, { "epoch": 3.121428370399913, "grad_norm": 0.09545739740133286, "learning_rate": 0.0001, "loss": 2.3567, "num_input_tokens_seen": 75590406592, "step": 142100 }, { "epoch": 3.1225266733159756, "grad_norm": 0.09857258945703506, "learning_rate": 0.0001, "loss": 2.3545, "num_input_tokens_seen": 75642835392, "step": 142150 }, { "epoch": 3.1236249762320383, "grad_norm": 0.08820495009422302, "learning_rate": 0.0001, "loss": 2.3463, "num_input_tokens_seen": 75695264192, "step": 142200 }, { "epoch": 3.1247232791481014, "grad_norm": 0.09423326700925827, "learning_rate": 0.0001, "loss": 2.3582, "num_input_tokens_seen": 75747692992, "step": 142250 }, { "epoch": 3.125821582064164, "grad_norm": 0.0870075449347496, "learning_rate": 0.0001, "loss": 2.3584, "num_input_tokens_seen": 75800121792, "step": 142300 }, { "epoch": 3.126919884980227, "grad_norm": 0.09343559294939041, "learning_rate": 0.0001, "loss": 2.3548, "num_input_tokens_seen": 75852547808, "step": 142350 }, { "epoch": 3.12801818789629, "grad_norm": 0.0868043452501297, "learning_rate": 0.0001, "loss": 2.3497, "num_input_tokens_seen": 75904976608, "step": 142400 }, { "epoch": 3.1291164908123528, "grad_norm": 0.08903662115335464, "learning_rate": 0.0001, "loss": 2.3551, "num_input_tokens_seen": 75957405408, "step": 142450 }, { "epoch": 3.130214793728416, "grad_norm": 0.09318575263023376, "learning_rate": 0.0001, "loss": 2.3568, "num_input_tokens_seen": 76009834208, "step": 142500 }, { "epoch": 3.130214793728416, "eval_loss": 2.268275260925293, "eval_runtime": 80.0625, "eval_samples_per_second": 62.451, "eval_steps_per_second": 15.613, "num_input_tokens_seen": 76009834208, "step": 142500 }, { "epoch": 3.1313130966444787, "grad_norm": 0.09889211505651474, "learning_rate": 0.0001, "loss": 2.3566, "num_input_tokens_seen": 76062256320, "step": 142550 }, { "epoch": 3.1324113995605414, "grad_norm": 0.0866382047533989, "learning_rate": 0.0001, "loss": 2.3558, "num_input_tokens_seen": 76114685120, "step": 142600 }, { "epoch": 3.1335097024766045, "grad_norm": 0.09349516034126282, "learning_rate": 0.0001, "loss": 2.3588, "num_input_tokens_seen": 76167111584, "step": 142650 }, { "epoch": 3.1346080053926673, "grad_norm": 0.0880579948425293, "learning_rate": 0.0001, "loss": 2.3586, "num_input_tokens_seen": 76219538976, "step": 142700 }, { "epoch": 3.1357063083087304, "grad_norm": 0.08621477335691452, "learning_rate": 0.0001, "loss": 2.3582, "num_input_tokens_seen": 76271966560, "step": 142750 }, { "epoch": 3.136804611224793, "grad_norm": 0.08953891694545746, "learning_rate": 0.0001, "loss": 2.3564, "num_input_tokens_seen": 76324395360, "step": 142800 }, { "epoch": 3.137902914140856, "grad_norm": 0.10092195123434067, "learning_rate": 0.0001, "loss": 2.3583, "num_input_tokens_seen": 76376824160, "step": 142850 }, { "epoch": 3.139001217056919, "grad_norm": 0.08993303775787354, "learning_rate": 0.0001, "loss": 2.349, "num_input_tokens_seen": 76429250784, "step": 142900 }, { "epoch": 3.1400995199729818, "grad_norm": 0.08924704045057297, "learning_rate": 0.0001, "loss": 2.3533, "num_input_tokens_seen": 76481679456, "step": 142950 }, { "epoch": 3.1411978228890445, "grad_norm": 0.09532574564218521, "learning_rate": 0.0001, "loss": 2.3494, "num_input_tokens_seen": 76534108256, "step": 143000 }, { "epoch": 3.1411978228890445, "eval_loss": 2.267223358154297, "eval_runtime": 80.7695, "eval_samples_per_second": 61.905, "eval_steps_per_second": 15.476, "num_input_tokens_seen": 76534108256, "step": 143000 }, { "epoch": 3.1422961258051076, "grad_norm": 0.0887688547372818, "learning_rate": 0.0001, "loss": 2.3614, "num_input_tokens_seen": 76586530944, "step": 143050 }, { "epoch": 3.1433944287211704, "grad_norm": 0.09098675847053528, "learning_rate": 0.0001, "loss": 2.3579, "num_input_tokens_seen": 76638954624, "step": 143100 }, { "epoch": 3.144492731637233, "grad_norm": 0.0879465788602829, "learning_rate": 0.0001, "loss": 2.3544, "num_input_tokens_seen": 76691382816, "step": 143150 }, { "epoch": 3.1455910345532963, "grad_norm": 0.09020204097032547, "learning_rate": 0.0001, "loss": 2.3608, "num_input_tokens_seen": 76743811616, "step": 143200 }, { "epoch": 3.146689337469359, "grad_norm": 0.0859370082616806, "learning_rate": 0.0001, "loss": 2.3572, "num_input_tokens_seen": 76796240416, "step": 143250 }, { "epoch": 3.147787640385422, "grad_norm": 0.08659014105796814, "learning_rate": 0.0001, "loss": 2.3513, "num_input_tokens_seen": 76848663776, "step": 143300 }, { "epoch": 3.148885943301485, "grad_norm": 0.09119068831205368, "learning_rate": 0.0001, "loss": 2.3591, "num_input_tokens_seen": 76901090432, "step": 143350 }, { "epoch": 3.1499842462175476, "grad_norm": 0.08679833263158798, "learning_rate": 0.0001, "loss": 2.3462, "num_input_tokens_seen": 76953519232, "step": 143400 }, { "epoch": 3.1510825491336107, "grad_norm": 0.09158121794462204, "learning_rate": 0.0001, "loss": 2.354, "num_input_tokens_seen": 77005948032, "step": 143450 }, { "epoch": 3.1521808520496735, "grad_norm": 0.08906587958335876, "learning_rate": 0.0001, "loss": 2.3582, "num_input_tokens_seen": 77058376736, "step": 143500 }, { "epoch": 3.1521808520496735, "eval_loss": 2.2670507431030273, "eval_runtime": 80.7537, "eval_samples_per_second": 61.917, "eval_steps_per_second": 15.479, "num_input_tokens_seen": 77058376736, "step": 143500 }, { "epoch": 3.153279154965736, "grad_norm": 0.09259413182735443, "learning_rate": 0.0001, "loss": 2.3501, "num_input_tokens_seen": 77110805536, "step": 143550 }, { "epoch": 3.1543774578817994, "grad_norm": 0.08947575837373734, "learning_rate": 0.0001, "loss": 2.3595, "num_input_tokens_seen": 77163233984, "step": 143600 }, { "epoch": 3.155475760797862, "grad_norm": 0.09320012480020523, "learning_rate": 0.0001, "loss": 2.3566, "num_input_tokens_seen": 77215662784, "step": 143650 }, { "epoch": 3.1565740637139252, "grad_norm": 0.09630698710680008, "learning_rate": 0.0001, "loss": 2.3619, "num_input_tokens_seen": 77268081952, "step": 143700 }, { "epoch": 3.157672366629988, "grad_norm": 0.08778363466262817, "learning_rate": 0.0001, "loss": 2.3565, "num_input_tokens_seen": 77320510752, "step": 143750 }, { "epoch": 3.1587706695460507, "grad_norm": 0.08947426080703735, "learning_rate": 0.0001, "loss": 2.3497, "num_input_tokens_seen": 77372937824, "step": 143800 }, { "epoch": 3.159868972462114, "grad_norm": 0.09249094128608704, "learning_rate": 0.0001, "loss": 2.3508, "num_input_tokens_seen": 77425366624, "step": 143850 }, { "epoch": 3.1609672753781766, "grad_norm": 0.09168736636638641, "learning_rate": 0.0001, "loss": 2.3522, "num_input_tokens_seen": 77477795424, "step": 143900 }, { "epoch": 3.1620655782942393, "grad_norm": 0.08995141088962555, "learning_rate": 0.0001, "loss": 2.3506, "num_input_tokens_seen": 77530222464, "step": 143950 }, { "epoch": 3.1631638812103025, "grad_norm": 0.09521841257810593, "learning_rate": 0.0001, "loss": 2.3521, "num_input_tokens_seen": 77582650016, "step": 144000 }, { "epoch": 3.1631638812103025, "eval_loss": 2.266408920288086, "eval_runtime": 80.1189, "eval_samples_per_second": 62.407, "eval_steps_per_second": 15.602, "num_input_tokens_seen": 77582650016, "step": 144000 }, { "epoch": 3.164262184126365, "grad_norm": 0.11045259982347488, "learning_rate": 0.0001, "loss": 2.3539, "num_input_tokens_seen": 77635078816, "step": 144050 }, { "epoch": 3.1653604870424283, "grad_norm": 0.09217249602079391, "learning_rate": 0.0001, "loss": 2.343, "num_input_tokens_seen": 77687507616, "step": 144100 }, { "epoch": 3.166458789958491, "grad_norm": 0.09735982865095139, "learning_rate": 0.0001, "loss": 2.3519, "num_input_tokens_seen": 77739935872, "step": 144150 }, { "epoch": 3.167557092874554, "grad_norm": 0.08962323516607285, "learning_rate": 0.0001, "loss": 2.3473, "num_input_tokens_seen": 77792364672, "step": 144200 }, { "epoch": 3.168655395790617, "grad_norm": 0.09209229052066803, "learning_rate": 0.0001, "loss": 2.3555, "num_input_tokens_seen": 77844789440, "step": 144250 }, { "epoch": 3.1697536987066797, "grad_norm": 0.09181981533765793, "learning_rate": 0.0001, "loss": 2.348, "num_input_tokens_seen": 77897216224, "step": 144300 }, { "epoch": 3.1708520016227424, "grad_norm": 0.0833621546626091, "learning_rate": 0.0001, "loss": 2.3454, "num_input_tokens_seen": 77949645024, "step": 144350 }, { "epoch": 3.1719503045388056, "grad_norm": 0.09470181167125702, "learning_rate": 0.0001, "loss": 2.3491, "num_input_tokens_seen": 78002073440, "step": 144400 }, { "epoch": 3.1730486074548683, "grad_norm": 0.09017711132764816, "learning_rate": 0.0001, "loss": 2.3545, "num_input_tokens_seen": 78054502240, "step": 144450 }, { "epoch": 3.174146910370931, "grad_norm": 0.09220907092094421, "learning_rate": 0.0001, "loss": 2.3508, "num_input_tokens_seen": 78106920224, "step": 144500 }, { "epoch": 3.174146910370931, "eval_loss": 2.2662734985351562, "eval_runtime": 80.1838, "eval_samples_per_second": 62.357, "eval_steps_per_second": 15.589, "num_input_tokens_seen": 78106920224, "step": 144500 }, { "epoch": 3.175245213286994, "grad_norm": 0.0899912640452385, "learning_rate": 0.0001, "loss": 2.3607, "num_input_tokens_seen": 78159346816, "step": 144550 }, { "epoch": 3.176343516203057, "grad_norm": 0.09542039781808853, "learning_rate": 0.0001, "loss": 2.3405, "num_input_tokens_seen": 78211775616, "step": 144600 }, { "epoch": 3.17744181911912, "grad_norm": 0.09049233049154282, "learning_rate": 0.0001, "loss": 2.3483, "num_input_tokens_seen": 78264204032, "step": 144650 }, { "epoch": 3.1785401220351828, "grad_norm": 0.09777910262346268, "learning_rate": 0.0001, "loss": 2.3514, "num_input_tokens_seen": 78316631456, "step": 144700 }, { "epoch": 3.1796384249512455, "grad_norm": 0.09022051095962524, "learning_rate": 0.0001, "loss": 2.3531, "num_input_tokens_seen": 78369060192, "step": 144750 }, { "epoch": 3.1807367278673087, "grad_norm": 0.09529806673526764, "learning_rate": 0.0001, "loss": 2.3531, "num_input_tokens_seen": 78421488992, "step": 144800 }, { "epoch": 3.1818350307833714, "grad_norm": 0.08957728743553162, "learning_rate": 0.0001, "loss": 2.3554, "num_input_tokens_seen": 78473915424, "step": 144850 }, { "epoch": 3.182933333699434, "grad_norm": 0.09089584648609161, "learning_rate": 0.0001, "loss": 2.34, "num_input_tokens_seen": 78526342656, "step": 144900 }, { "epoch": 3.1840316366154973, "grad_norm": 0.09180238097906113, "learning_rate": 0.0001, "loss": 2.3542, "num_input_tokens_seen": 78578771456, "step": 144950 }, { "epoch": 3.18512993953156, "grad_norm": 0.09355945885181427, "learning_rate": 0.0001, "loss": 2.3581, "num_input_tokens_seen": 78631193376, "step": 145000 }, { "epoch": 3.18512993953156, "eval_loss": 2.2656116485595703, "eval_runtime": 80.4625, "eval_samples_per_second": 62.141, "eval_steps_per_second": 15.535, "num_input_tokens_seen": 78631193376, "step": 145000 }, { "epoch": 3.186228242447623, "grad_norm": 0.09815175086259842, "learning_rate": 0.0001, "loss": 2.3512, "num_input_tokens_seen": 78683622176, "step": 145050 }, { "epoch": 3.187326545363686, "grad_norm": 0.08802150189876556, "learning_rate": 0.0001, "loss": 2.352, "num_input_tokens_seen": 78736048064, "step": 145100 }, { "epoch": 3.1884248482797486, "grad_norm": 0.09322452545166016, "learning_rate": 0.0001, "loss": 2.3517, "num_input_tokens_seen": 78788476864, "step": 145150 }, { "epoch": 3.1895231511958118, "grad_norm": 0.09273571521043777, "learning_rate": 0.0001, "loss": 2.3525, "num_input_tokens_seen": 78840905664, "step": 145200 }, { "epoch": 3.1906214541118745, "grad_norm": 0.09510285407304764, "learning_rate": 0.0001, "loss": 2.3492, "num_input_tokens_seen": 78893334464, "step": 145250 }, { "epoch": 3.191719757027937, "grad_norm": 0.09299364686012268, "learning_rate": 0.0001, "loss": 2.3485, "num_input_tokens_seen": 78945760672, "step": 145300 }, { "epoch": 3.1928180599440004, "grad_norm": 0.08893129974603653, "learning_rate": 0.0001, "loss": 2.3529, "num_input_tokens_seen": 78998188864, "step": 145350 }, { "epoch": 3.193916362860063, "grad_norm": 0.08949380367994308, "learning_rate": 0.0001, "loss": 2.3563, "num_input_tokens_seen": 79050616800, "step": 145400 }, { "epoch": 3.1950146657761263, "grad_norm": 0.09386907517910004, "learning_rate": 0.0001, "loss": 2.3454, "num_input_tokens_seen": 79103043296, "step": 145450 }, { "epoch": 3.196112968692189, "grad_norm": 0.10135660320520401, "learning_rate": 0.0001, "loss": 2.3521, "num_input_tokens_seen": 79155472096, "step": 145500 }, { "epoch": 3.196112968692189, "eval_loss": 2.2653002738952637, "eval_runtime": 80.3673, "eval_samples_per_second": 62.214, "eval_steps_per_second": 15.554, "num_input_tokens_seen": 79155472096, "step": 145500 }, { "epoch": 3.1972112716082517, "grad_norm": 0.09017980098724365, "learning_rate": 0.0001, "loss": 2.3576, "num_input_tokens_seen": 79207895552, "step": 145550 }, { "epoch": 3.198309574524315, "grad_norm": 0.10214512050151825, "learning_rate": 0.0001, "loss": 2.3512, "num_input_tokens_seen": 79260320832, "step": 145600 }, { "epoch": 3.1994078774403776, "grad_norm": 0.08758047223091125, "learning_rate": 0.0001, "loss": 2.347, "num_input_tokens_seen": 79312749152, "step": 145650 }, { "epoch": 3.2005061803564403, "grad_norm": 0.09964236617088318, "learning_rate": 0.0001, "loss": 2.3466, "num_input_tokens_seen": 79365177440, "step": 145700 }, { "epoch": 3.2016044832725035, "grad_norm": 0.10335622727870941, "learning_rate": 0.0001, "loss": 2.3492, "num_input_tokens_seen": 79417604928, "step": 145750 }, { "epoch": 3.202702786188566, "grad_norm": 0.09493719041347504, "learning_rate": 0.0001, "loss": 2.3547, "num_input_tokens_seen": 79470031232, "step": 145800 }, { "epoch": 3.203801089104629, "grad_norm": 0.0902167409658432, "learning_rate": 0.0001, "loss": 2.3449, "num_input_tokens_seen": 79522455456, "step": 145850 }, { "epoch": 3.204899392020692, "grad_norm": 0.09135492146015167, "learning_rate": 0.0001, "loss": 2.3535, "num_input_tokens_seen": 79574884256, "step": 145900 }, { "epoch": 3.205997694936755, "grad_norm": 0.09141404926776886, "learning_rate": 0.0001, "loss": 2.3482, "num_input_tokens_seen": 79627304576, "step": 145950 }, { "epoch": 3.207095997852818, "grad_norm": 0.08859889209270477, "learning_rate": 0.0001, "loss": 2.3585, "num_input_tokens_seen": 79679733376, "step": 146000 }, { "epoch": 3.207095997852818, "eval_loss": 2.264711618423462, "eval_runtime": 80.4702, "eval_samples_per_second": 62.135, "eval_steps_per_second": 15.534, "num_input_tokens_seen": 79679733376, "step": 146000 }, { "epoch": 3.2081943007688807, "grad_norm": 0.0960359126329422, "learning_rate": 0.0001, "loss": 2.3526, "num_input_tokens_seen": 79732162176, "step": 146050 }, { "epoch": 3.2092926036849434, "grad_norm": 0.09091591835021973, "learning_rate": 0.0001, "loss": 2.3518, "num_input_tokens_seen": 79784590976, "step": 146100 }, { "epoch": 3.2103909066010066, "grad_norm": 0.09111624211072922, "learning_rate": 0.0001, "loss": 2.3491, "num_input_tokens_seen": 79837019776, "step": 146150 }, { "epoch": 3.2114892095170693, "grad_norm": 0.08795958012342453, "learning_rate": 0.0001, "loss": 2.3624, "num_input_tokens_seen": 79889448576, "step": 146200 }, { "epoch": 3.2125875124331325, "grad_norm": 0.09486380219459534, "learning_rate": 0.0001, "loss": 2.3526, "num_input_tokens_seen": 79941874336, "step": 146250 }, { "epoch": 3.213685815349195, "grad_norm": 0.08702174574136734, "learning_rate": 0.0001, "loss": 2.3503, "num_input_tokens_seen": 79994303136, "step": 146300 }, { "epoch": 3.214784118265258, "grad_norm": 0.09130252152681351, "learning_rate": 0.0001, "loss": 2.3524, "num_input_tokens_seen": 80046731936, "step": 146350 }, { "epoch": 3.215882421181321, "grad_norm": 0.09093622118234634, "learning_rate": 0.0001, "loss": 2.3467, "num_input_tokens_seen": 80099160736, "step": 146400 }, { "epoch": 3.216980724097384, "grad_norm": 0.09483332931995392, "learning_rate": 0.0001, "loss": 2.3492, "num_input_tokens_seen": 80151589536, "step": 146450 }, { "epoch": 3.2180790270134465, "grad_norm": 0.09806844592094421, "learning_rate": 0.0001, "loss": 2.3435, "num_input_tokens_seen": 80204017984, "step": 146500 }, { "epoch": 3.2180790270134465, "eval_loss": 2.264453172683716, "eval_runtime": 80.343, "eval_samples_per_second": 62.233, "eval_steps_per_second": 15.558, "num_input_tokens_seen": 80204017984, "step": 146500 }, { "epoch": 3.2191773299295097, "grad_norm": 0.09445874392986298, "learning_rate": 0.0001, "loss": 2.3395, "num_input_tokens_seen": 80256446784, "step": 146550 }, { "epoch": 3.2202756328455724, "grad_norm": 0.09850721806287766, "learning_rate": 0.0001, "loss": 2.3503, "num_input_tokens_seen": 80308875296, "step": 146600 }, { "epoch": 3.221373935761635, "grad_norm": 0.08922294527292252, "learning_rate": 0.0001, "loss": 2.3568, "num_input_tokens_seen": 80361299648, "step": 146650 }, { "epoch": 3.2224722386776983, "grad_norm": 0.09347110241651535, "learning_rate": 0.0001, "loss": 2.3486, "num_input_tokens_seen": 80413717120, "step": 146700 }, { "epoch": 3.223570541593761, "grad_norm": 0.09082797914743423, "learning_rate": 0.0001, "loss": 2.3525, "num_input_tokens_seen": 80466140832, "step": 146750 }, { "epoch": 3.224668844509824, "grad_norm": 0.09167880564928055, "learning_rate": 0.0001, "loss": 2.3541, "num_input_tokens_seen": 80518568704, "step": 146800 }, { "epoch": 3.225767147425887, "grad_norm": 0.09498349577188492, "learning_rate": 0.0001, "loss": 2.3498, "num_input_tokens_seen": 80570997024, "step": 146850 }, { "epoch": 3.2268654503419496, "grad_norm": 0.09598004817962646, "learning_rate": 0.0001, "loss": 2.3539, "num_input_tokens_seen": 80623425824, "step": 146900 }, { "epoch": 3.2279637532580128, "grad_norm": 0.09606627374887466, "learning_rate": 0.0001, "loss": 2.3537, "num_input_tokens_seen": 80675854624, "step": 146950 }, { "epoch": 3.2290620561740755, "grad_norm": 0.08999280631542206, "learning_rate": 0.0001, "loss": 2.3512, "num_input_tokens_seen": 80728283424, "step": 147000 }, { "epoch": 3.2290620561740755, "eval_loss": 2.2641336917877197, "eval_runtime": 80.7221, "eval_samples_per_second": 61.941, "eval_steps_per_second": 15.485, "num_input_tokens_seen": 80728283424, "step": 147000 }, { "epoch": 3.230160359090138, "grad_norm": 0.09034433215856552, "learning_rate": 0.0001, "loss": 2.3469, "num_input_tokens_seen": 80780712224, "step": 147050 }, { "epoch": 3.2312586620062014, "grad_norm": 0.08906085789203644, "learning_rate": 0.0001, "loss": 2.3556, "num_input_tokens_seen": 80833139616, "step": 147100 }, { "epoch": 3.232356964922264, "grad_norm": 0.08581134676933289, "learning_rate": 0.0001, "loss": 2.3549, "num_input_tokens_seen": 80885568416, "step": 147150 }, { "epoch": 3.2334552678383273, "grad_norm": 0.08810413628816605, "learning_rate": 0.0001, "loss": 2.3521, "num_input_tokens_seen": 80937997216, "step": 147200 }, { "epoch": 3.23455357075439, "grad_norm": 0.0902443677186966, "learning_rate": 0.0001, "loss": 2.3495, "num_input_tokens_seen": 80990426016, "step": 147250 }, { "epoch": 3.2356518736704527, "grad_norm": 0.09435313940048218, "learning_rate": 0.0001, "loss": 2.3535, "num_input_tokens_seen": 81042854816, "step": 147300 }, { "epoch": 3.236750176586516, "grad_norm": 0.09080259501934052, "learning_rate": 0.0001, "loss": 2.3562, "num_input_tokens_seen": 81095283616, "step": 147350 }, { "epoch": 3.2378484795025786, "grad_norm": 0.0864415243268013, "learning_rate": 0.0001, "loss": 2.352, "num_input_tokens_seen": 81147710304, "step": 147400 }, { "epoch": 3.2389467824186413, "grad_norm": 0.09428194910287857, "learning_rate": 0.0001, "loss": 2.3461, "num_input_tokens_seen": 81200139104, "step": 147450 }, { "epoch": 3.2400450853347045, "grad_norm": 0.08963849395513535, "learning_rate": 0.0001, "loss": 2.3441, "num_input_tokens_seen": 81252567904, "step": 147500 }, { "epoch": 3.2400450853347045, "eval_loss": 2.263394832611084, "eval_runtime": 80.123, "eval_samples_per_second": 62.404, "eval_steps_per_second": 15.601, "num_input_tokens_seen": 81252567904, "step": 147500 }, { "epoch": 3.241143388250767, "grad_norm": 0.09004587680101395, "learning_rate": 0.0001, "loss": 2.3565, "num_input_tokens_seen": 81304996704, "step": 147550 }, { "epoch": 3.2422416911668304, "grad_norm": 0.08573032915592194, "learning_rate": 0.0001, "loss": 2.3553, "num_input_tokens_seen": 81357422528, "step": 147600 }, { "epoch": 3.243339994082893, "grad_norm": 0.0909392312169075, "learning_rate": 0.0001, "loss": 2.3466, "num_input_tokens_seen": 81409846656, "step": 147650 }, { "epoch": 3.244438296998956, "grad_norm": 0.1009620726108551, "learning_rate": 0.0001, "loss": 2.356, "num_input_tokens_seen": 81462275456, "step": 147700 }, { "epoch": 3.245536599915019, "grad_norm": 0.08916173875331879, "learning_rate": 0.0001, "loss": 2.3465, "num_input_tokens_seen": 81514704256, "step": 147750 }, { "epoch": 3.2466349028310817, "grad_norm": 0.09659174829721451, "learning_rate": 0.0001, "loss": 2.3415, "num_input_tokens_seen": 81567132320, "step": 147800 }, { "epoch": 3.2477332057471444, "grad_norm": 0.09102753549814224, "learning_rate": 0.0001, "loss": 2.3471, "num_input_tokens_seen": 81619559456, "step": 147850 }, { "epoch": 3.2488315086632076, "grad_norm": 0.09342406690120697, "learning_rate": 0.0001, "loss": 2.3511, "num_input_tokens_seen": 81671988256, "step": 147900 }, { "epoch": 3.2499298115792703, "grad_norm": 0.08837909251451492, "learning_rate": 0.0001, "loss": 2.3498, "num_input_tokens_seen": 81724417056, "step": 147950 }, { "epoch": 3.251028114495333, "grad_norm": 0.09298506379127502, "learning_rate": 0.0001, "loss": 2.3542, "num_input_tokens_seen": 81776845856, "step": 148000 }, { "epoch": 3.251028114495333, "eval_loss": 2.2628493309020996, "eval_runtime": 80.0409, "eval_samples_per_second": 62.468, "eval_steps_per_second": 15.617, "num_input_tokens_seen": 81776845856, "step": 148000 }, { "epoch": 3.252126417411396, "grad_norm": 0.09104160964488983, "learning_rate": 0.0001, "loss": 2.3493, "num_input_tokens_seen": 81829274656, "step": 148050 }, { "epoch": 3.253224720327459, "grad_norm": 0.09389001131057739, "learning_rate": 0.0001, "loss": 2.3517, "num_input_tokens_seen": 81881698752, "step": 148100 }, { "epoch": 3.254323023243522, "grad_norm": 0.09085691720247269, "learning_rate": 0.0001, "loss": 2.3449, "num_input_tokens_seen": 81934124352, "step": 148150 }, { "epoch": 3.255421326159585, "grad_norm": 0.09462492913007736, "learning_rate": 0.0001, "loss": 2.3535, "num_input_tokens_seen": 81986553152, "step": 148200 }, { "epoch": 3.2565196290756475, "grad_norm": 0.09079829603433609, "learning_rate": 0.0001, "loss": 2.3477, "num_input_tokens_seen": 82038981952, "step": 148250 }, { "epoch": 3.2576179319917107, "grad_norm": 0.09165850281715393, "learning_rate": 0.0001, "loss": 2.3548, "num_input_tokens_seen": 82091407040, "step": 148300 }, { "epoch": 3.2587162349077734, "grad_norm": 0.09388460963964462, "learning_rate": 0.0001, "loss": 2.3477, "num_input_tokens_seen": 82143834912, "step": 148350 }, { "epoch": 3.2598145378238366, "grad_norm": 0.09309230744838715, "learning_rate": 0.0001, "loss": 2.3506, "num_input_tokens_seen": 82196262432, "step": 148400 }, { "epoch": 3.2609128407398993, "grad_norm": 0.09229093790054321, "learning_rate": 0.0001, "loss": 2.3451, "num_input_tokens_seen": 82248691232, "step": 148450 }, { "epoch": 3.262011143655962, "grad_norm": 0.09784185886383057, "learning_rate": 0.0001, "loss": 2.3447, "num_input_tokens_seen": 82301120032, "step": 148500 }, { "epoch": 3.262011143655962, "eval_loss": 2.2628486156463623, "eval_runtime": 80.5596, "eval_samples_per_second": 62.066, "eval_steps_per_second": 15.516, "num_input_tokens_seen": 82301120032, "step": 148500 }, { "epoch": 3.263109446572025, "grad_norm": 0.09117227792739868, "learning_rate": 0.0001, "loss": 2.3541, "num_input_tokens_seen": 82353545696, "step": 148550 }, { "epoch": 3.264207749488088, "grad_norm": 0.09306716918945312, "learning_rate": 0.0001, "loss": 2.3522, "num_input_tokens_seen": 82405974496, "step": 148600 }, { "epoch": 3.2653060524041506, "grad_norm": 0.0911402553319931, "learning_rate": 0.0001, "loss": 2.3479, "num_input_tokens_seen": 82458403296, "step": 148650 }, { "epoch": 3.266404355320214, "grad_norm": 0.09247133135795593, "learning_rate": 0.0001, "loss": 2.3459, "num_input_tokens_seen": 82510832096, "step": 148700 }, { "epoch": 3.2675026582362765, "grad_norm": 0.08942971378564835, "learning_rate": 0.0001, "loss": 2.3442, "num_input_tokens_seen": 82563260896, "step": 148750 }, { "epoch": 3.2686009611523392, "grad_norm": 0.09245148301124573, "learning_rate": 0.0001, "loss": 2.3455, "num_input_tokens_seen": 82615689600, "step": 148800 }, { "epoch": 3.2696992640684024, "grad_norm": 0.09015721827745438, "learning_rate": 0.0001, "loss": 2.3502, "num_input_tokens_seen": 82668116128, "step": 148850 }, { "epoch": 3.270797566984465, "grad_norm": 0.1070784255862236, "learning_rate": 0.0001, "loss": 2.3561, "num_input_tokens_seen": 82720544928, "step": 148900 }, { "epoch": 3.2718958699005283, "grad_norm": 0.09262741357088089, "learning_rate": 0.0001, "loss": 2.343, "num_input_tokens_seen": 82772973728, "step": 148950 }, { "epoch": 3.272994172816591, "grad_norm": 0.0885642021894455, "learning_rate": 0.0001, "loss": 2.3477, "num_input_tokens_seen": 82825402528, "step": 149000 }, { "epoch": 3.272994172816591, "eval_loss": 2.262723684310913, "eval_runtime": 80.9161, "eval_samples_per_second": 61.792, "eval_steps_per_second": 15.448, "num_input_tokens_seen": 82825402528, "step": 149000 }, { "epoch": 3.2740924757326537, "grad_norm": 0.09636425226926804, "learning_rate": 0.0001, "loss": 2.3512, "num_input_tokens_seen": 82877831328, "step": 149050 }, { "epoch": 3.275190778648717, "grad_norm": 0.09321967512369156, "learning_rate": 0.0001, "loss": 2.3464, "num_input_tokens_seen": 82930260128, "step": 149100 }, { "epoch": 3.2762890815647796, "grad_norm": 0.09646619111299515, "learning_rate": 0.0001, "loss": 2.3477, "num_input_tokens_seen": 82982684224, "step": 149150 }, { "epoch": 3.2773873844808423, "grad_norm": 0.09394371509552002, "learning_rate": 0.0001, "loss": 2.3436, "num_input_tokens_seen": 83035113024, "step": 149200 }, { "epoch": 3.2784856873969055, "grad_norm": 0.09308401495218277, "learning_rate": 0.0001, "loss": 2.3543, "num_input_tokens_seen": 83087541664, "step": 149250 }, { "epoch": 3.279583990312968, "grad_norm": 0.09759179502725601, "learning_rate": 0.0001, "loss": 2.3418, "num_input_tokens_seen": 83139969344, "step": 149300 }, { "epoch": 3.280682293229031, "grad_norm": 0.0901869386434555, "learning_rate": 0.0001, "loss": 2.3484, "num_input_tokens_seen": 83192395872, "step": 149350 }, { "epoch": 3.281780596145094, "grad_norm": 0.0878322646021843, "learning_rate": 0.0001, "loss": 2.3497, "num_input_tokens_seen": 83244824672, "step": 149400 }, { "epoch": 3.282878899061157, "grad_norm": 0.09223738312721252, "learning_rate": 0.0001, "loss": 2.3505, "num_input_tokens_seen": 83297245472, "step": 149450 }, { "epoch": 3.28397720197722, "grad_norm": 0.0960436537861824, "learning_rate": 0.0001, "loss": 2.348, "num_input_tokens_seen": 83349674016, "step": 149500 }, { "epoch": 3.28397720197722, "eval_loss": 2.2621405124664307, "eval_runtime": 80.8246, "eval_samples_per_second": 61.862, "eval_steps_per_second": 15.466, "num_input_tokens_seen": 83349674016, "step": 149500 }, { "epoch": 3.2850755048932827, "grad_norm": 0.09093570709228516, "learning_rate": 0.0001, "loss": 2.3531, "num_input_tokens_seen": 83402102304, "step": 149550 }, { "epoch": 3.2861738078093454, "grad_norm": 0.09345680475234985, "learning_rate": 0.0001, "loss": 2.3425, "num_input_tokens_seen": 83454531104, "step": 149600 }, { "epoch": 3.2872721107254086, "grad_norm": 0.09687989205121994, "learning_rate": 0.0001, "loss": 2.3467, "num_input_tokens_seen": 83506956512, "step": 149650 }, { "epoch": 3.2883704136414713, "grad_norm": 0.09994108229875565, "learning_rate": 0.0001, "loss": 2.3526, "num_input_tokens_seen": 83559385312, "step": 149700 }, { "epoch": 3.2894687165575345, "grad_norm": 0.0894683226943016, "learning_rate": 0.0001, "loss": 2.3545, "num_input_tokens_seen": 83611807424, "step": 149750 }, { "epoch": 3.290567019473597, "grad_norm": 0.08919432759284973, "learning_rate": 0.0001, "loss": 2.3375, "num_input_tokens_seen": 83664236224, "step": 149800 }, { "epoch": 3.29166532238966, "grad_norm": 0.0880848690867424, "learning_rate": 0.0001, "loss": 2.3411, "num_input_tokens_seen": 83716662592, "step": 149850 }, { "epoch": 3.292763625305723, "grad_norm": 0.09474777430295944, "learning_rate": 0.0001, "loss": 2.3454, "num_input_tokens_seen": 83769091072, "step": 149900 }, { "epoch": 3.293861928221786, "grad_norm": 0.09384060651063919, "learning_rate": 0.0001, "loss": 2.3532, "num_input_tokens_seen": 83821518880, "step": 149950 }, { "epoch": 3.2949602311378485, "grad_norm": 0.09099478274583817, "learning_rate": 0.0001, "loss": 2.345, "num_input_tokens_seen": 83873945408, "step": 150000 }, { "epoch": 3.2949602311378485, "eval_loss": 2.2621686458587646, "eval_runtime": 80.2172, "eval_samples_per_second": 62.331, "eval_steps_per_second": 15.583, "num_input_tokens_seen": 83873945408, "step": 150000 }, { "epoch": 3.2960585340539117, "grad_norm": 0.0940065085887909, "learning_rate": 0.0001, "loss": 2.3491, "num_input_tokens_seen": 83926374208, "step": 150050 }, { "epoch": 3.2971568369699744, "grad_norm": 0.09787734597921371, "learning_rate": 0.0001, "loss": 2.3445, "num_input_tokens_seen": 83978803008, "step": 150100 }, { "epoch": 3.298255139886037, "grad_norm": 0.0898478627204895, "learning_rate": 0.0001, "loss": 2.3487, "num_input_tokens_seen": 84031230624, "step": 150150 }, { "epoch": 3.2993534428021003, "grad_norm": 0.09385386109352112, "learning_rate": 0.0001, "loss": 2.352, "num_input_tokens_seen": 84083659424, "step": 150200 }, { "epoch": 3.300451745718163, "grad_norm": 0.09281744062900543, "learning_rate": 0.0001, "loss": 2.3547, "num_input_tokens_seen": 84136088224, "step": 150250 }, { "epoch": 3.301550048634226, "grad_norm": 0.0919499471783638, "learning_rate": 0.0001, "loss": 2.3461, "num_input_tokens_seen": 84188516096, "step": 150300 }, { "epoch": 3.302648351550289, "grad_norm": 0.09604910016059875, "learning_rate": 0.0001, "loss": 2.35, "num_input_tokens_seen": 84240944896, "step": 150350 }, { "epoch": 3.3037466544663516, "grad_norm": 0.09299291670322418, "learning_rate": 0.0001, "loss": 2.3541, "num_input_tokens_seen": 84293373696, "step": 150400 }, { "epoch": 3.304844957382415, "grad_norm": 0.09353625029325485, "learning_rate": 0.0001, "loss": 2.3482, "num_input_tokens_seen": 84345802496, "step": 150450 }, { "epoch": 3.3059432602984775, "grad_norm": 0.09208831191062927, "learning_rate": 0.0001, "loss": 2.3429, "num_input_tokens_seen": 84398231296, "step": 150500 }, { "epoch": 3.3059432602984775, "eval_loss": 2.2615652084350586, "eval_runtime": 80.4386, "eval_samples_per_second": 62.159, "eval_steps_per_second": 15.54, "num_input_tokens_seen": 84398231296, "step": 150500 }, { "epoch": 3.3070415632145407, "grad_norm": 0.09018178284168243, "learning_rate": 0.0001, "loss": 2.3478, "num_input_tokens_seen": 84450660096, "step": 150550 }, { "epoch": 3.3081398661306034, "grad_norm": 0.09050137549638748, "learning_rate": 0.0001, "loss": 2.3468, "num_input_tokens_seen": 84503088896, "step": 150600 }, { "epoch": 3.309238169046666, "grad_norm": 0.10714724659919739, "learning_rate": 0.0001, "loss": 2.35, "num_input_tokens_seen": 84555517696, "step": 150650 }, { "epoch": 3.310336471962729, "grad_norm": 0.09455420076847076, "learning_rate": 0.0001, "loss": 2.3506, "num_input_tokens_seen": 84607937664, "step": 150700 }, { "epoch": 3.311434774878792, "grad_norm": 0.0927010029554367, "learning_rate": 0.0001, "loss": 2.3494, "num_input_tokens_seen": 84660366464, "step": 150750 }, { "epoch": 3.3125330777948547, "grad_norm": 0.09752020984888077, "learning_rate": 0.0001, "loss": 2.3451, "num_input_tokens_seen": 84712791744, "step": 150800 }, { "epoch": 3.313631380710918, "grad_norm": 0.094636932015419, "learning_rate": 0.0001, "loss": 2.3478, "num_input_tokens_seen": 84765217024, "step": 150850 }, { "epoch": 3.3147296836269806, "grad_norm": 0.09965428709983826, "learning_rate": 0.0001, "loss": 2.3493, "num_input_tokens_seen": 84817642304, "step": 150900 }, { "epoch": 3.3158279865430433, "grad_norm": 0.0947231873869896, "learning_rate": 0.0001, "loss": 2.3495, "num_input_tokens_seen": 84870071104, "step": 150950 }, { "epoch": 3.3169262894591065, "grad_norm": 0.09301018714904785, "learning_rate": 0.0001, "loss": 2.3476, "num_input_tokens_seen": 84922496000, "step": 151000 }, { "epoch": 3.3169262894591065, "eval_loss": 2.261502504348755, "eval_runtime": 80.5453, "eval_samples_per_second": 62.077, "eval_steps_per_second": 15.519, "num_input_tokens_seen": 84922496000, "step": 151000 }, { "epoch": 3.3180245923751692, "grad_norm": 0.09862314164638519, "learning_rate": 0.0001, "loss": 2.3591, "num_input_tokens_seen": 84974923744, "step": 151050 }, { "epoch": 3.3191228952912324, "grad_norm": 0.09108031541109085, "learning_rate": 0.0001, "loss": 2.348, "num_input_tokens_seen": 85027352544, "step": 151100 }, { "epoch": 3.320221198207295, "grad_norm": 0.10582481324672699, "learning_rate": 0.0001, "loss": 2.3405, "num_input_tokens_seen": 85079779360, "step": 151150 }, { "epoch": 3.321319501123358, "grad_norm": 0.09384915232658386, "learning_rate": 0.0001, "loss": 2.3497, "num_input_tokens_seen": 85132203008, "step": 151200 }, { "epoch": 3.322417804039421, "grad_norm": 0.09305764734745026, "learning_rate": 0.0001, "loss": 2.3537, "num_input_tokens_seen": 85184631808, "step": 151250 }, { "epoch": 3.3235161069554837, "grad_norm": 0.09376391023397446, "learning_rate": 0.0001, "loss": 2.345, "num_input_tokens_seen": 85237060608, "step": 151300 }, { "epoch": 3.3246144098715464, "grad_norm": 0.09613306075334549, "learning_rate": 0.0001, "loss": 2.3462, "num_input_tokens_seen": 85289487488, "step": 151350 }, { "epoch": 3.3257127127876096, "grad_norm": 0.09579232335090637, "learning_rate": 0.0001, "loss": 2.3522, "num_input_tokens_seen": 85341909344, "step": 151400 }, { "epoch": 3.3268110157036723, "grad_norm": 0.08879666030406952, "learning_rate": 0.0001, "loss": 2.353, "num_input_tokens_seen": 85394338144, "step": 151450 }, { "epoch": 3.327909318619735, "grad_norm": 0.09744927287101746, "learning_rate": 0.0001, "loss": 2.3521, "num_input_tokens_seen": 85446766432, "step": 151500 }, { "epoch": 3.327909318619735, "eval_loss": 2.26111102104187, "eval_runtime": 79.9827, "eval_samples_per_second": 62.514, "eval_steps_per_second": 15.628, "num_input_tokens_seen": 85446766432, "step": 151500 }, { "epoch": 3.329007621535798, "grad_norm": 0.0889928862452507, "learning_rate": 0.0001, "loss": 2.3436, "num_input_tokens_seen": 85499195232, "step": 151550 }, { "epoch": 3.330105924451861, "grad_norm": 0.09036596864461899, "learning_rate": 0.0001, "loss": 2.3525, "num_input_tokens_seen": 85551624032, "step": 151600 }, { "epoch": 3.331204227367924, "grad_norm": 0.09018707275390625, "learning_rate": 0.0001, "loss": 2.3397, "num_input_tokens_seen": 85604051680, "step": 151650 }, { "epoch": 3.332302530283987, "grad_norm": 0.09039066731929779, "learning_rate": 0.0001, "loss": 2.3529, "num_input_tokens_seen": 85656480480, "step": 151700 }, { "epoch": 3.3334008332000495, "grad_norm": 0.09161881357431412, "learning_rate": 0.0001, "loss": 2.3521, "num_input_tokens_seen": 85708908800, "step": 151750 }, { "epoch": 3.3344991361161127, "grad_norm": 0.09359107166528702, "learning_rate": 0.0001, "loss": 2.3535, "num_input_tokens_seen": 85761337600, "step": 151800 }, { "epoch": 3.3355974390321754, "grad_norm": 0.09596438705921173, "learning_rate": 0.0001, "loss": 2.3507, "num_input_tokens_seen": 85813766400, "step": 151850 }, { "epoch": 3.3366957419482386, "grad_norm": 0.09060530364513397, "learning_rate": 0.0001, "loss": 2.3488, "num_input_tokens_seen": 85866195200, "step": 151900 }, { "epoch": 3.3377940448643013, "grad_norm": 0.09371770173311234, "learning_rate": 0.0001, "loss": 2.3452, "num_input_tokens_seen": 85918618144, "step": 151950 }, { "epoch": 3.338892347780364, "grad_norm": 0.10104481130838394, "learning_rate": 0.0001, "loss": 2.3474, "num_input_tokens_seen": 85971046944, "step": 152000 }, { "epoch": 3.338892347780364, "eval_loss": 2.2608485221862793, "eval_runtime": 80.4976, "eval_samples_per_second": 62.114, "eval_steps_per_second": 15.528, "num_input_tokens_seen": 85971046944, "step": 152000 }, { "epoch": 3.339990650696427, "grad_norm": 0.10089096426963806, "learning_rate": 0.0001, "loss": 2.3468, "num_input_tokens_seen": 86023475744, "step": 152050 }, { "epoch": 3.34108895361249, "grad_norm": 0.093196339905262, "learning_rate": 0.0001, "loss": 2.351, "num_input_tokens_seen": 86075903552, "step": 152100 }, { "epoch": 3.3421872565285526, "grad_norm": 0.09717004001140594, "learning_rate": 0.0001, "loss": 2.3486, "num_input_tokens_seen": 86128331904, "step": 152150 }, { "epoch": 3.343285559444616, "grad_norm": 0.09837143868207932, "learning_rate": 0.0001, "loss": 2.3424, "num_input_tokens_seen": 86180760704, "step": 152200 }, { "epoch": 3.3443838623606785, "grad_norm": 0.08969301730394363, "learning_rate": 0.0001, "loss": 2.355, "num_input_tokens_seen": 86233189120, "step": 152250 }, { "epoch": 3.3454821652767412, "grad_norm": 0.09237448871135712, "learning_rate": 0.0001, "loss": 2.3472, "num_input_tokens_seen": 86285617920, "step": 152300 }, { "epoch": 3.3465804681928044, "grad_norm": 0.08787325024604797, "learning_rate": 0.0001, "loss": 2.3456, "num_input_tokens_seen": 86338046720, "step": 152350 }, { "epoch": 3.347678771108867, "grad_norm": 0.0857023298740387, "learning_rate": 0.0001, "loss": 2.3511, "num_input_tokens_seen": 86390472640, "step": 152400 }, { "epoch": 3.3487770740249303, "grad_norm": 0.09470528364181519, "learning_rate": 0.0001, "loss": 2.3481, "num_input_tokens_seen": 86442901440, "step": 152450 }, { "epoch": 3.349875376940993, "grad_norm": 0.08560249954462051, "learning_rate": 0.0001, "loss": 2.3419, "num_input_tokens_seen": 86495325760, "step": 152500 }, { "epoch": 3.349875376940993, "eval_loss": 2.260103225708008, "eval_runtime": 79.9152, "eval_samples_per_second": 62.566, "eval_steps_per_second": 15.642, "num_input_tokens_seen": 86495325760, "step": 152500 }, { "epoch": 3.3509736798570557, "grad_norm": 0.1033218502998352, "learning_rate": 0.0001, "loss": 2.3473, "num_input_tokens_seen": 86547749728, "step": 152550 }, { "epoch": 3.352071982773119, "grad_norm": 0.08745966106653214, "learning_rate": 0.0001, "loss": 2.3524, "num_input_tokens_seen": 86600178048, "step": 152600 }, { "epoch": 3.3531702856891816, "grad_norm": 0.08843880146741867, "learning_rate": 0.0001, "loss": 2.3467, "num_input_tokens_seen": 86652604992, "step": 152650 }, { "epoch": 3.3542685886052444, "grad_norm": 0.09221280366182327, "learning_rate": 0.0001, "loss": 2.3472, "num_input_tokens_seen": 86705033792, "step": 152700 }, { "epoch": 3.3553668915213075, "grad_norm": 0.09814682602882385, "learning_rate": 0.0001, "loss": 2.3482, "num_input_tokens_seen": 86757462592, "step": 152750 }, { "epoch": 3.3564651944373702, "grad_norm": 0.09335148334503174, "learning_rate": 0.0001, "loss": 2.3454, "num_input_tokens_seen": 86809891392, "step": 152800 }, { "epoch": 3.357563497353433, "grad_norm": 0.09705245494842529, "learning_rate": 0.0001, "loss": 2.351, "num_input_tokens_seen": 86862320192, "step": 152850 }, { "epoch": 3.358661800269496, "grad_norm": 0.09022082388401031, "learning_rate": 0.0001, "loss": 2.3474, "num_input_tokens_seen": 86914748672, "step": 152900 }, { "epoch": 3.359760103185559, "grad_norm": 0.09682248532772064, "learning_rate": 0.0001, "loss": 2.3424, "num_input_tokens_seen": 86967177472, "step": 152950 }, { "epoch": 3.360858406101622, "grad_norm": 0.092010997235775, "learning_rate": 0.0001, "loss": 2.3463, "num_input_tokens_seen": 87019603744, "step": 153000 }, { "epoch": 3.360858406101622, "eval_loss": 2.2600295543670654, "eval_runtime": 80.7842, "eval_samples_per_second": 61.893, "eval_steps_per_second": 15.473, "num_input_tokens_seen": 87019603744, "step": 153000 }, { "epoch": 3.3619567090176847, "grad_norm": 0.09880559146404266, "learning_rate": 0.0001, "loss": 2.3507, "num_input_tokens_seen": 87072028928, "step": 153050 }, { "epoch": 3.3630550119337475, "grad_norm": 0.09557171911001205, "learning_rate": 0.0001, "loss": 2.3501, "num_input_tokens_seen": 87124457728, "step": 153100 }, { "epoch": 3.3641533148498106, "grad_norm": 0.09662795066833496, "learning_rate": 0.0001, "loss": 2.3499, "num_input_tokens_seen": 87176886528, "step": 153150 }, { "epoch": 3.3652516177658733, "grad_norm": 0.09444860368967056, "learning_rate": 0.0001, "loss": 2.3455, "num_input_tokens_seen": 87229314144, "step": 153200 }, { "epoch": 3.3663499206819365, "grad_norm": 0.09235711395740509, "learning_rate": 0.0001, "loss": 2.3414, "num_input_tokens_seen": 87281742944, "step": 153250 }, { "epoch": 3.3674482235979992, "grad_norm": 0.09402730315923691, "learning_rate": 0.0001, "loss": 2.3476, "num_input_tokens_seen": 87334170144, "step": 153300 }, { "epoch": 3.368546526514062, "grad_norm": 0.09448970854282379, "learning_rate": 0.0001, "loss": 2.3455, "num_input_tokens_seen": 87386598944, "step": 153350 }, { "epoch": 3.369644829430125, "grad_norm": 0.09693534672260284, "learning_rate": 0.0001, "loss": 2.3438, "num_input_tokens_seen": 87439027744, "step": 153400 }, { "epoch": 3.370743132346188, "grad_norm": 0.09623382240533829, "learning_rate": 0.0001, "loss": 2.3441, "num_input_tokens_seen": 87491455136, "step": 153450 }, { "epoch": 3.3718414352622506, "grad_norm": 0.09151625633239746, "learning_rate": 0.0001, "loss": 2.3437, "num_input_tokens_seen": 87543883936, "step": 153500 }, { "epoch": 3.3718414352622506, "eval_loss": 2.2598507404327393, "eval_runtime": 80.5747, "eval_samples_per_second": 62.054, "eval_steps_per_second": 15.514, "num_input_tokens_seen": 87543883936, "step": 153500 }, { "epoch": 3.3729397381783137, "grad_norm": 0.09348684549331665, "learning_rate": 0.0001, "loss": 2.352, "num_input_tokens_seen": 87596312736, "step": 153550 }, { "epoch": 3.3740380410943764, "grad_norm": 0.08888132125139236, "learning_rate": 0.0001, "loss": 2.3439, "num_input_tokens_seen": 87648741536, "step": 153600 }, { "epoch": 3.375136344010439, "grad_norm": 0.09428702294826508, "learning_rate": 0.0001, "loss": 2.3515, "num_input_tokens_seen": 87701167552, "step": 153650 }, { "epoch": 3.3762346469265023, "grad_norm": 0.0958220586180687, "learning_rate": 0.0001, "loss": 2.3509, "num_input_tokens_seen": 87753593472, "step": 153700 }, { "epoch": 3.377332949842565, "grad_norm": 0.0915323942899704, "learning_rate": 0.0001, "loss": 2.3424, "num_input_tokens_seen": 87806022112, "step": 153750 }, { "epoch": 3.378431252758628, "grad_norm": 0.09283678978681564, "learning_rate": 0.0001, "loss": 2.3523, "num_input_tokens_seen": 87858450912, "step": 153800 }, { "epoch": 3.379529555674691, "grad_norm": 0.09269370883703232, "learning_rate": 0.0001, "loss": 2.348, "num_input_tokens_seen": 87910879712, "step": 153850 }, { "epoch": 3.3806278585907537, "grad_norm": 0.09313447028398514, "learning_rate": 0.0001, "loss": 2.3479, "num_input_tokens_seen": 87963308512, "step": 153900 }, { "epoch": 3.381726161506817, "grad_norm": 0.09186484664678574, "learning_rate": 0.0001, "loss": 2.3438, "num_input_tokens_seen": 88015735712, "step": 153950 }, { "epoch": 3.3828244644228795, "grad_norm": 0.09128747135400772, "learning_rate": 0.0001, "loss": 2.3461, "num_input_tokens_seen": 88068164512, "step": 154000 }, { "epoch": 3.3828244644228795, "eval_loss": 2.259556293487549, "eval_runtime": 80.7318, "eval_samples_per_second": 61.933, "eval_steps_per_second": 15.483, "num_input_tokens_seen": 88068164512, "step": 154000 }, { "epoch": 3.3839227673389427, "grad_norm": 0.09020719677209854, "learning_rate": 0.0001, "loss": 2.3497, "num_input_tokens_seen": 88120592672, "step": 154050 }, { "epoch": 3.3850210702550054, "grad_norm": 0.09590700268745422, "learning_rate": 0.0001, "loss": 2.3511, "num_input_tokens_seen": 88173020768, "step": 154100 }, { "epoch": 3.386119373171068, "grad_norm": 0.09949789196252823, "learning_rate": 0.0001, "loss": 2.3442, "num_input_tokens_seen": 88225449568, "step": 154150 }, { "epoch": 3.387217676087131, "grad_norm": 0.08850202709436417, "learning_rate": 0.0001, "loss": 2.3491, "num_input_tokens_seen": 88277876544, "step": 154200 }, { "epoch": 3.388315979003194, "grad_norm": 0.09314405918121338, "learning_rate": 0.0001, "loss": 2.3502, "num_input_tokens_seen": 88330304416, "step": 154250 }, { "epoch": 3.3894142819192568, "grad_norm": 0.09926445782184601, "learning_rate": 0.0001, "loss": 2.345, "num_input_tokens_seen": 88382733216, "step": 154300 }, { "epoch": 3.39051258483532, "grad_norm": 0.09468773007392883, "learning_rate": 0.0001, "loss": 2.3429, "num_input_tokens_seen": 88435156832, "step": 154350 }, { "epoch": 3.3916108877513826, "grad_norm": 0.09661844372749329, "learning_rate": 0.0001, "loss": 2.3465, "num_input_tokens_seen": 88487585632, "step": 154400 }, { "epoch": 3.3927091906674454, "grad_norm": 0.09253469109535217, "learning_rate": 0.0001, "loss": 2.3455, "num_input_tokens_seen": 88540012544, "step": 154450 }, { "epoch": 3.3938074935835085, "grad_norm": 0.08921549469232559, "learning_rate": 0.0001, "loss": 2.3491, "num_input_tokens_seen": 88592441344, "step": 154500 }, { "epoch": 3.3938074935835085, "eval_loss": 2.259216785430908, "eval_runtime": 80.6571, "eval_samples_per_second": 61.991, "eval_steps_per_second": 15.498, "num_input_tokens_seen": 88592441344, "step": 154500 }, { "epoch": 3.3949057964995712, "grad_norm": 0.09509548544883728, "learning_rate": 0.0001, "loss": 2.3423, "num_input_tokens_seen": 88644870144, "step": 154550 }, { "epoch": 3.3960040994156344, "grad_norm": 0.0929824560880661, "learning_rate": 0.0001, "loss": 2.3429, "num_input_tokens_seen": 88697298944, "step": 154600 }, { "epoch": 3.397102402331697, "grad_norm": 0.09483112394809723, "learning_rate": 0.0001, "loss": 2.3451, "num_input_tokens_seen": 88749721920, "step": 154650 }, { "epoch": 3.39820070524776, "grad_norm": 0.09392872452735901, "learning_rate": 0.0001, "loss": 2.3437, "num_input_tokens_seen": 88802150720, "step": 154700 }, { "epoch": 3.399299008163823, "grad_norm": 0.09336952865123749, "learning_rate": 0.0001, "loss": 2.3414, "num_input_tokens_seen": 88854579520, "step": 154750 }, { "epoch": 3.4003973110798857, "grad_norm": 0.0942087471485138, "learning_rate": 0.0001, "loss": 2.3497, "num_input_tokens_seen": 88907008320, "step": 154800 }, { "epoch": 3.4014956139959485, "grad_norm": 0.10055429488420486, "learning_rate": 0.0001, "loss": 2.3472, "num_input_tokens_seen": 88959437120, "step": 154850 }, { "epoch": 3.4025939169120116, "grad_norm": 0.10026364773511887, "learning_rate": 0.0001, "loss": 2.3524, "num_input_tokens_seen": 89011865920, "step": 154900 }, { "epoch": 3.4036922198280744, "grad_norm": 0.09667252004146576, "learning_rate": 0.0001, "loss": 2.3448, "num_input_tokens_seen": 89064293312, "step": 154950 }, { "epoch": 3.404790522744137, "grad_norm": 3.4680259227752686, "learning_rate": 0.0001, "loss": 2.3532, "num_input_tokens_seen": 89116722112, "step": 155000 }, { "epoch": 3.404790522744137, "eval_loss": 2.2594308853149414, "eval_runtime": 80.9279, "eval_samples_per_second": 61.783, "eval_steps_per_second": 15.446, "num_input_tokens_seen": 89116722112, "step": 155000 }, { "epoch": 3.4058888256602002, "grad_norm": 0.09823120385408401, "learning_rate": 0.0001, "loss": 2.3564, "num_input_tokens_seen": 89169150912, "step": 155050 }, { "epoch": 3.406987128576263, "grad_norm": 0.08918890357017517, "learning_rate": 0.0001, "loss": 2.351, "num_input_tokens_seen": 89221579712, "step": 155100 }, { "epoch": 3.408085431492326, "grad_norm": 0.09893437474966049, "learning_rate": 0.0001, "loss": 2.3513, "num_input_tokens_seen": 89274002944, "step": 155150 }, { "epoch": 3.409183734408389, "grad_norm": 0.09413613379001617, "learning_rate": 0.0001, "loss": 2.3456, "num_input_tokens_seen": 89326431744, "step": 155200 }, { "epoch": 3.4102820373244516, "grad_norm": 0.09172441065311432, "learning_rate": 0.0001, "loss": 2.3463, "num_input_tokens_seen": 89378858112, "step": 155250 }, { "epoch": 3.4113803402405147, "grad_norm": 0.08989844471216202, "learning_rate": 0.0001, "loss": 2.3568, "num_input_tokens_seen": 89431286912, "step": 155300 }, { "epoch": 3.4124786431565775, "grad_norm": 0.09628511965274811, "learning_rate": 0.0001, "loss": 2.3417, "num_input_tokens_seen": 89483711232, "step": 155350 }, { "epoch": 3.4135769460726406, "grad_norm": 0.0964779257774353, "learning_rate": 0.0001, "loss": 2.3461, "num_input_tokens_seen": 89536140032, "step": 155400 }, { "epoch": 3.4146752489887033, "grad_norm": 0.08790320158004761, "learning_rate": 0.0001, "loss": 2.3512, "num_input_tokens_seen": 89588568832, "step": 155450 }, { "epoch": 3.415773551904766, "grad_norm": 0.0895400196313858, "learning_rate": 0.0001, "loss": 2.3422, "num_input_tokens_seen": 89640995520, "step": 155500 }, { "epoch": 3.415773551904766, "eval_loss": 2.258901596069336, "eval_runtime": 80.3232, "eval_samples_per_second": 62.249, "eval_steps_per_second": 15.562, "num_input_tokens_seen": 89640995520, "step": 155500 }, { "epoch": 3.4168718548208292, "grad_norm": 0.10451982915401459, "learning_rate": 0.0001, "loss": 2.343, "num_input_tokens_seen": 89693424320, "step": 155550 }, { "epoch": 3.417970157736892, "grad_norm": 0.09347954392433167, "learning_rate": 0.0001, "loss": 2.3498, "num_input_tokens_seen": 89745849280, "step": 155600 }, { "epoch": 3.4190684606529547, "grad_norm": 0.08768779039382935, "learning_rate": 0.0001, "loss": 2.3496, "num_input_tokens_seen": 89798278080, "step": 155650 }, { "epoch": 3.420166763569018, "grad_norm": 0.09618473798036575, "learning_rate": 0.0001, "loss": 2.3431, "num_input_tokens_seen": 89850704384, "step": 155700 }, { "epoch": 3.4212650664850806, "grad_norm": 0.09236596524715424, "learning_rate": 0.0001, "loss": 2.3442, "num_input_tokens_seen": 89903133184, "step": 155750 }, { "epoch": 3.4223633694011433, "grad_norm": 0.08978936821222305, "learning_rate": 0.0001, "loss": 2.3427, "num_input_tokens_seen": 89955561984, "step": 155800 }, { "epoch": 3.4234616723172064, "grad_norm": 0.09113803505897522, "learning_rate": 0.0001, "loss": 2.3487, "num_input_tokens_seen": 90007990784, "step": 155850 }, { "epoch": 3.424559975233269, "grad_norm": 0.09217377752065659, "learning_rate": 0.0001, "loss": 2.3469, "num_input_tokens_seen": 90060419584, "step": 155900 }, { "epoch": 3.4256582781493323, "grad_norm": 0.09746365249156952, "learning_rate": 0.0001, "loss": 2.3464, "num_input_tokens_seen": 90112844480, "step": 155950 }, { "epoch": 3.426756581065395, "grad_norm": 0.09867499768733978, "learning_rate": 0.0001, "loss": 2.3493, "num_input_tokens_seen": 90165273280, "step": 156000 }, { "epoch": 3.426756581065395, "eval_loss": 2.2586677074432373, "eval_runtime": 80.2547, "eval_samples_per_second": 62.302, "eval_steps_per_second": 15.575, "num_input_tokens_seen": 90165273280, "step": 156000 }, { "epoch": 3.4278548839814578, "grad_norm": 0.09176173806190491, "learning_rate": 0.0001, "loss": 2.3456, "num_input_tokens_seen": 90217696288, "step": 156050 }, { "epoch": 3.428953186897521, "grad_norm": 0.089874766767025, "learning_rate": 0.0001, "loss": 2.3475, "num_input_tokens_seen": 90270125088, "step": 156100 }, { "epoch": 3.4300514898135837, "grad_norm": 0.0928313210606575, "learning_rate": 0.0001, "loss": 2.3515, "num_input_tokens_seen": 90322553888, "step": 156150 }, { "epoch": 3.4311497927296464, "grad_norm": 0.09114927798509598, "learning_rate": 0.0001, "loss": 2.3494, "num_input_tokens_seen": 90374980192, "step": 156200 }, { "epoch": 3.4322480956457095, "grad_norm": 0.09503049403429031, "learning_rate": 0.0001, "loss": 2.3439, "num_input_tokens_seen": 90427408992, "step": 156250 }, { "epoch": 3.4333463985617723, "grad_norm": 0.09301070868968964, "learning_rate": 0.0001, "loss": 2.3487, "num_input_tokens_seen": 90479837120, "step": 156300 }, { "epoch": 3.434444701477835, "grad_norm": 0.09190023690462112, "learning_rate": 0.0001, "loss": 2.3448, "num_input_tokens_seen": 90532258144, "step": 156350 }, { "epoch": 3.435543004393898, "grad_norm": 0.09636224061250687, "learning_rate": 0.0001, "loss": 2.3456, "num_input_tokens_seen": 90584686944, "step": 156400 }, { "epoch": 3.436641307309961, "grad_norm": 0.09875821322202682, "learning_rate": 0.0001, "loss": 2.3415, "num_input_tokens_seen": 90637115744, "step": 156450 }, { "epoch": 3.437739610226024, "grad_norm": 0.09387224912643433, "learning_rate": 0.0001, "loss": 2.3483, "num_input_tokens_seen": 90689540064, "step": 156500 }, { "epoch": 3.437739610226024, "eval_loss": 2.2581753730773926, "eval_runtime": 79.6955, "eval_samples_per_second": 62.739, "eval_steps_per_second": 15.685, "num_input_tokens_seen": 90689540064, "step": 156500 }, { "epoch": 3.4388379131420868, "grad_norm": 0.08944698423147202, "learning_rate": 0.0001, "loss": 2.3459, "num_input_tokens_seen": 90741968864, "step": 156550 }, { "epoch": 3.4399362160581495, "grad_norm": 0.09725566953420639, "learning_rate": 0.0001, "loss": 2.3399, "num_input_tokens_seen": 90794397664, "step": 156600 }, { "epoch": 3.4410345189742126, "grad_norm": 0.09932785481214523, "learning_rate": 0.0001, "loss": 2.3475, "num_input_tokens_seen": 90846826464, "step": 156650 }, { "epoch": 3.4421328218902754, "grad_norm": 0.09854361414909363, "learning_rate": 0.0001, "loss": 2.3449, "num_input_tokens_seen": 90899255264, "step": 156700 }, { "epoch": 3.4432311248063385, "grad_norm": 0.09402545541524887, "learning_rate": 0.0001, "loss": 2.3434, "num_input_tokens_seen": 90951684064, "step": 156750 }, { "epoch": 3.4443294277224012, "grad_norm": 0.09715921431779861, "learning_rate": 0.0001, "loss": 2.3455, "num_input_tokens_seen": 91004112864, "step": 156800 }, { "epoch": 3.445427730638464, "grad_norm": 0.09590257704257965, "learning_rate": 0.0001, "loss": 2.348, "num_input_tokens_seen": 91056541664, "step": 156850 }, { "epoch": 3.446526033554527, "grad_norm": 0.10155434161424637, "learning_rate": 0.0001, "loss": 2.3403, "num_input_tokens_seen": 91108970464, "step": 156900 }, { "epoch": 3.44762433647059, "grad_norm": 0.09132086485624313, "learning_rate": 0.0001, "loss": 2.3569, "num_input_tokens_seen": 91161399264, "step": 156950 }, { "epoch": 3.4487226393866526, "grad_norm": 0.0917491465806961, "learning_rate": 0.0001, "loss": 2.3454, "num_input_tokens_seen": 91213822304, "step": 157000 }, { "epoch": 3.4487226393866526, "eval_loss": 2.2578930854797363, "eval_runtime": 80.5074, "eval_samples_per_second": 62.106, "eval_steps_per_second": 15.527, "num_input_tokens_seen": 91213822304, "step": 157000 }, { "epoch": 3.4498209423027157, "grad_norm": 0.09102839231491089, "learning_rate": 0.0001, "loss": 2.3407, "num_input_tokens_seen": 91266251104, "step": 157050 }, { "epoch": 3.4509192452187785, "grad_norm": 0.09813258051872253, "learning_rate": 0.0001, "loss": 2.3461, "num_input_tokens_seen": 91318679904, "step": 157100 }, { "epoch": 3.452017548134841, "grad_norm": 0.09532950073480606, "learning_rate": 0.0001, "loss": 2.3457, "num_input_tokens_seen": 91371108704, "step": 157150 }, { "epoch": 3.4531158510509044, "grad_norm": 0.10110923647880554, "learning_rate": 0.0001, "loss": 2.3426, "num_input_tokens_seen": 91423537504, "step": 157200 }, { "epoch": 3.454214153966967, "grad_norm": 0.09686494618654251, "learning_rate": 0.0001, "loss": 2.342, "num_input_tokens_seen": 91475959552, "step": 157250 }, { "epoch": 3.4553124568830302, "grad_norm": 0.09327523410320282, "learning_rate": 0.0001, "loss": 2.3477, "num_input_tokens_seen": 91528385536, "step": 157300 }, { "epoch": 3.456410759799093, "grad_norm": 0.10524465143680573, "learning_rate": 0.0001, "loss": 2.351, "num_input_tokens_seen": 91580812064, "step": 157350 }, { "epoch": 3.4575090627151557, "grad_norm": 0.08858100324869156, "learning_rate": 0.0001, "loss": 2.3443, "num_input_tokens_seen": 91633240864, "step": 157400 }, { "epoch": 3.458607365631219, "grad_norm": 0.0905861109495163, "learning_rate": 0.0001, "loss": 2.338, "num_input_tokens_seen": 91685669664, "step": 157450 }, { "epoch": 3.4597056685472816, "grad_norm": 0.0902877077460289, "learning_rate": 0.0001, "loss": 2.3447, "num_input_tokens_seen": 91738098464, "step": 157500 }, { "epoch": 3.4597056685472816, "eval_loss": 2.257868766784668, "eval_runtime": 80.702, "eval_samples_per_second": 61.956, "eval_steps_per_second": 15.489, "num_input_tokens_seen": 91738098464, "step": 157500 }, { "epoch": 3.4608039714633447, "grad_norm": 0.09488774091005325, "learning_rate": 0.0001, "loss": 2.348, "num_input_tokens_seen": 91790527264, "step": 157550 }, { "epoch": 3.4619022743794075, "grad_norm": 0.09437818825244904, "learning_rate": 0.0001, "loss": 2.3366, "num_input_tokens_seen": 91842955104, "step": 157600 }, { "epoch": 3.46300057729547, "grad_norm": 0.09216772764921188, "learning_rate": 0.0001, "loss": 2.3455, "num_input_tokens_seen": 91895383904, "step": 157650 }, { "epoch": 3.464098880211533, "grad_norm": 0.0893646627664566, "learning_rate": 0.0001, "loss": 2.3436, "num_input_tokens_seen": 91947812704, "step": 157700 }, { "epoch": 3.465197183127596, "grad_norm": 0.10555808991193771, "learning_rate": 0.0001, "loss": 2.3407, "num_input_tokens_seen": 92000241504, "step": 157750 }, { "epoch": 3.466295486043659, "grad_norm": 0.09263647347688675, "learning_rate": 0.0001, "loss": 2.3455, "num_input_tokens_seen": 92052667776, "step": 157800 }, { "epoch": 3.467393788959722, "grad_norm": 0.09790777415037155, "learning_rate": 0.0001, "loss": 2.3426, "num_input_tokens_seen": 92105094528, "step": 157850 }, { "epoch": 3.4684920918757847, "grad_norm": 0.0883532464504242, "learning_rate": 0.0001, "loss": 2.3453, "num_input_tokens_seen": 92157523328, "step": 157900 }, { "epoch": 3.4695903947918474, "grad_norm": 0.09379395842552185, "learning_rate": 0.0001, "loss": 2.3452, "num_input_tokens_seen": 92209949248, "step": 157950 }, { "epoch": 3.4706886977079106, "grad_norm": 0.09533659368753433, "learning_rate": 0.0001, "loss": 2.3454, "num_input_tokens_seen": 92262378048, "step": 158000 }, { "epoch": 3.4706886977079106, "eval_loss": 2.258249521255493, "eval_runtime": 80.6484, "eval_samples_per_second": 61.998, "eval_steps_per_second": 15.499, "num_input_tokens_seen": 92262378048, "step": 158000 }, { "epoch": 3.4717870006239733, "grad_norm": 0.09290427714586258, "learning_rate": 0.0001, "loss": 2.3419, "num_input_tokens_seen": 92314806848, "step": 158050 }, { "epoch": 3.4728853035400364, "grad_norm": 0.09033751487731934, "learning_rate": 0.0001, "loss": 2.344, "num_input_tokens_seen": 92367235648, "step": 158100 }, { "epoch": 3.473983606456099, "grad_norm": 0.0893242284655571, "learning_rate": 0.0001, "loss": 2.342, "num_input_tokens_seen": 92419664448, "step": 158150 }, { "epoch": 3.475081909372162, "grad_norm": 0.09877942502498627, "learning_rate": 0.0001, "loss": 2.3459, "num_input_tokens_seen": 92472093248, "step": 158200 }, { "epoch": 3.476180212288225, "grad_norm": 0.09732919931411743, "learning_rate": 0.0001, "loss": 2.3476, "num_input_tokens_seen": 92524522048, "step": 158250 }, { "epoch": 3.4772785152042878, "grad_norm": 0.08927954733371735, "learning_rate": 0.0001, "loss": 2.349, "num_input_tokens_seen": 92576950848, "step": 158300 }, { "epoch": 3.4783768181203505, "grad_norm": 0.09306230396032333, "learning_rate": 0.0001, "loss": 2.3396, "num_input_tokens_seen": 92629379648, "step": 158350 }, { "epoch": 3.4794751210364137, "grad_norm": 0.09522947669029236, "learning_rate": 0.0001, "loss": 2.3394, "num_input_tokens_seen": 92681808448, "step": 158400 }, { "epoch": 3.4805734239524764, "grad_norm": 0.09624941647052765, "learning_rate": 0.0001, "loss": 2.3475, "num_input_tokens_seen": 92734237248, "step": 158450 }, { "epoch": 3.481671726868539, "grad_norm": 0.09459653496742249, "learning_rate": 0.0001, "loss": 2.3431, "num_input_tokens_seen": 92786665216, "step": 158500 }, { "epoch": 3.481671726868539, "eval_loss": 2.257422685623169, "eval_runtime": 80.6973, "eval_samples_per_second": 61.96, "eval_steps_per_second": 15.49, "num_input_tokens_seen": 92786665216, "step": 158500 }, { "epoch": 3.4827700297846023, "grad_norm": 0.09564249962568283, "learning_rate": 0.0001, "loss": 2.341, "num_input_tokens_seen": 92839093024, "step": 158550 }, { "epoch": 3.483868332700665, "grad_norm": 0.10864699631929398, "learning_rate": 0.0001, "loss": 2.3405, "num_input_tokens_seen": 92891521824, "step": 158600 }, { "epoch": 3.484966635616728, "grad_norm": 0.09777586907148361, "learning_rate": 0.0001, "loss": 2.3445, "num_input_tokens_seen": 92943950624, "step": 158650 }, { "epoch": 3.486064938532791, "grad_norm": 0.09032690525054932, "learning_rate": 0.0001, "loss": 2.3423, "num_input_tokens_seen": 92996375712, "step": 158700 }, { "epoch": 3.4871632414488536, "grad_norm": 0.09027489274740219, "learning_rate": 0.0001, "loss": 2.3412, "num_input_tokens_seen": 93048803136, "step": 158750 }, { "epoch": 3.4882615443649168, "grad_norm": 0.09923077374696732, "learning_rate": 0.0001, "loss": 2.3455, "num_input_tokens_seen": 93101231936, "step": 158800 }, { "epoch": 3.4893598472809795, "grad_norm": 0.10047315806150436, "learning_rate": 0.0001, "loss": 2.3416, "num_input_tokens_seen": 93153660736, "step": 158850 }, { "epoch": 3.4904581501970426, "grad_norm": 0.0912187322974205, "learning_rate": 0.0001, "loss": 2.3437, "num_input_tokens_seen": 93206089536, "step": 158900 }, { "epoch": 3.4915564531131054, "grad_norm": 0.09997432678937912, "learning_rate": 0.0001, "loss": 2.341, "num_input_tokens_seen": 93258518336, "step": 158950 }, { "epoch": 3.492654756029168, "grad_norm": 0.09082050621509552, "learning_rate": 0.0001, "loss": 2.3338, "num_input_tokens_seen": 93310947136, "step": 159000 }, { "epoch": 3.492654756029168, "eval_loss": 2.2572686672210693, "eval_runtime": 80.2123, "eval_samples_per_second": 62.335, "eval_steps_per_second": 15.584, "num_input_tokens_seen": 93310947136, "step": 159000 } ], "logging_steps": 50, "max_steps": 200000, "num_input_tokens_seen": 93310947136, "num_train_epochs": 5, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.651433565139625e+20, "train_batch_size": 32, "trial_name": null, "trial_params": null }