{ "best_metric": null, "best_model_checkpoint": null, "epoch": 6.357279084551812, "eval_steps": 500, "global_step": 2500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.06, "grad_norm": 1.6613423824310303, "learning_rate": 6.361323155216286e-07, "loss": 2.3639, "step": 25 }, { "epoch": 0.13, "grad_norm": 1.3924915790557861, "learning_rate": 1.2722646310432571e-06, "loss": 2.3559, "step": 50 }, { "epoch": 0.19, "grad_norm": 0.9411530494689941, "learning_rate": 1.908396946564886e-06, "loss": 2.3245, "step": 75 }, { "epoch": 0.25, "grad_norm": 0.7608889937400818, "learning_rate": 2.5445292620865143e-06, "loss": 2.2691, "step": 100 }, { "epoch": 0.32, "grad_norm": 0.8099371790885925, "learning_rate": 3.1806615776081427e-06, "loss": 2.2409, "step": 125 }, { "epoch": 0.38, "grad_norm": 0.6795283555984497, "learning_rate": 3.816793893129772e-06, "loss": 2.2435, "step": 150 }, { "epoch": 0.45, "grad_norm": 0.6041733622550964, "learning_rate": 4.4529262086514e-06, "loss": 2.2375, "step": 175 }, { "epoch": 0.51, "grad_norm": 0.644061803817749, "learning_rate": 5.0890585241730285e-06, "loss": 2.2012, "step": 200 }, { "epoch": 0.57, "grad_norm": 0.6609583497047424, "learning_rate": 5.725190839694656e-06, "loss": 2.213, "step": 225 }, { "epoch": 0.64, "grad_norm": 0.6314918994903564, "learning_rate": 6.3613231552162854e-06, "loss": 2.2088, "step": 250 }, { "epoch": 0.7, "grad_norm": 0.6217303276062012, "learning_rate": 6.997455470737914e-06, "loss": 2.1823, "step": 275 }, { "epoch": 0.76, "grad_norm": 0.624250054359436, "learning_rate": 7.633587786259543e-06, "loss": 2.1685, "step": 300 }, { "epoch": 0.83, "grad_norm": 0.692873477935791, "learning_rate": 8.26972010178117e-06, "loss": 2.166, "step": 325 }, { "epoch": 0.89, "grad_norm": 0.6286394000053406, "learning_rate": 8.9058524173028e-06, "loss": 2.1521, "step": 350 }, { "epoch": 0.95, "grad_norm": 0.6370307803153992, "learning_rate": 9.54198473282443e-06, "loss": 2.149, "step": 375 }, { "epoch": 1.02, "grad_norm": 0.614319384098053, "learning_rate": 9.999903358354628e-06, "loss": 2.1586, "step": 400 }, { "epoch": 1.08, "grad_norm": 0.6239405274391174, "learning_rate": 9.997980516292023e-06, "loss": 2.1352, "step": 425 }, { "epoch": 1.14, "grad_norm": 0.6648218631744385, "learning_rate": 9.99359341519765e-06, "loss": 2.1352, "step": 450 }, { "epoch": 1.21, "grad_norm": 0.6202364563941956, "learning_rate": 9.986744218135864e-06, "loss": 2.1187, "step": 475 }, { "epoch": 1.27, "grad_norm": 0.6447356939315796, "learning_rate": 9.977436302109771e-06, "loss": 2.1135, "step": 500 }, { "epoch": 1.27, "eval_loss": 1.9783179759979248, "eval_runtime": 2.6728, "eval_samples_per_second": 187.071, "eval_steps_per_second": 23.571, "step": 500 }, { "epoch": 1.34, "grad_norm": 0.6953230500221252, "learning_rate": 9.96567425639619e-06, "loss": 2.1071, "step": 525 }, { "epoch": 1.4, "grad_norm": 0.6353166103363037, "learning_rate": 9.951463880282912e-06, "loss": 2.109, "step": 550 }, { "epoch": 1.46, "grad_norm": 0.5800075531005859, "learning_rate": 9.93481218020935e-06, "loss": 2.1165, "step": 575 }, { "epoch": 1.53, "grad_norm": 0.6457290053367615, "learning_rate": 9.915727366312012e-06, "loss": 2.1009, "step": 600 }, { "epoch": 1.59, "grad_norm": 0.5942001938819885, "learning_rate": 9.894218848376482e-06, "loss": 2.1133, "step": 625 }, { "epoch": 1.65, "grad_norm": 0.6016191244125366, "learning_rate": 9.870297231197922e-06, "loss": 2.111, "step": 650 }, { "epoch": 1.72, "grad_norm": 0.6793413758277893, "learning_rate": 9.843974309352356e-06, "loss": 2.0791, "step": 675 }, { "epoch": 1.78, "grad_norm": 0.6286611557006836, "learning_rate": 9.81526306138136e-06, "loss": 2.1128, "step": 700 }, { "epoch": 1.84, "grad_norm": 0.6072127223014832, "learning_rate": 9.784177643392958e-06, "loss": 2.0818, "step": 725 }, { "epoch": 1.91, "grad_norm": 0.581510066986084, "learning_rate": 9.750733382081965e-06, "loss": 2.0686, "step": 750 }, { "epoch": 1.97, "grad_norm": 0.607868492603302, "learning_rate": 9.714946767173124e-06, "loss": 2.0733, "step": 775 }, { "epoch": 2.03, "grad_norm": 0.7183848023414612, "learning_rate": 9.676835443290842e-06, "loss": 2.0666, "step": 800 }, { "epoch": 2.1, "grad_norm": 0.6486113667488098, "learning_rate": 9.63641820125949e-06, "loss": 2.0623, "step": 825 }, { "epoch": 2.16, "grad_norm": 0.6377106308937073, "learning_rate": 9.593714968838568e-06, "loss": 2.0564, "step": 850 }, { "epoch": 2.23, "grad_norm": 0.5825337171554565, "learning_rate": 9.548746800897305e-06, "loss": 2.0636, "step": 875 }, { "epoch": 2.29, "grad_norm": 0.6640869975090027, "learning_rate": 9.501535869033537e-06, "loss": 2.0637, "step": 900 }, { "epoch": 2.35, "grad_norm": 0.604284405708313, "learning_rate": 9.452105450641974e-06, "loss": 2.063, "step": 925 }, { "epoch": 2.42, "grad_norm": 0.6545228362083435, "learning_rate": 9.400479917437267e-06, "loss": 2.0379, "step": 950 }, { "epoch": 2.48, "grad_norm": 0.6467187404632568, "learning_rate": 9.346684723437504e-06, "loss": 2.0654, "step": 975 }, { "epoch": 2.54, "grad_norm": 0.6103580594062805, "learning_rate": 9.290746392414084e-06, "loss": 2.0503, "step": 1000 }, { "epoch": 2.54, "eval_loss": 1.9211387634277344, "eval_runtime": 2.6351, "eval_samples_per_second": 189.749, "eval_steps_per_second": 23.908, "step": 1000 }, { "epoch": 2.61, "grad_norm": 0.6254684329032898, "learning_rate": 9.232692504814154e-06, "loss": 2.0655, "step": 1025 }, { "epoch": 2.67, "grad_norm": 0.6028435230255127, "learning_rate": 9.172551684162025e-06, "loss": 2.0678, "step": 1050 }, { "epoch": 2.73, "grad_norm": 0.609348714351654, "learning_rate": 9.110353582946341e-06, "loss": 2.0406, "step": 1075 }, { "epoch": 2.8, "grad_norm": 0.6535661816596985, "learning_rate": 9.046128867999867e-06, "loss": 2.072, "step": 1100 }, { "epoch": 2.86, "grad_norm": 0.6350586414337158, "learning_rate": 8.979909205379198e-06, "loss": 2.0436, "step": 1125 }, { "epoch": 2.92, "grad_norm": 0.6003224849700928, "learning_rate": 8.911727244751763e-06, "loss": 2.0428, "step": 1150 }, { "epoch": 2.99, "grad_norm": 0.6171026825904846, "learning_rate": 8.84161660329789e-06, "loss": 2.0466, "step": 1175 }, { "epoch": 3.05, "grad_norm": 0.6239070892333984, "learning_rate": 8.76961184913581e-06, "loss": 2.0201, "step": 1200 }, { "epoch": 3.12, "grad_norm": 0.6221436858177185, "learning_rate": 8.695748484277833e-06, "loss": 2.0281, "step": 1225 }, { "epoch": 3.18, "grad_norm": 0.6175958514213562, "learning_rate": 8.620062927126021e-06, "loss": 2.0472, "step": 1250 }, { "epoch": 3.24, "grad_norm": 0.6366366147994995, "learning_rate": 8.54259249451608e-06, "loss": 2.0326, "step": 1275 }, { "epoch": 3.31, "grad_norm": 0.6660250425338745, "learning_rate": 8.463375383318254e-06, "loss": 2.0263, "step": 1300 }, { "epoch": 3.37, "grad_norm": 0.5739914178848267, "learning_rate": 8.382450651604316e-06, "loss": 2.0265, "step": 1325 }, { "epoch": 3.43, "grad_norm": 0.6790344715118408, "learning_rate": 8.29985819938996e-06, "loss": 2.0219, "step": 1350 }, { "epoch": 3.5, "grad_norm": 0.6223481893539429, "learning_rate": 8.215638748962047e-06, "loss": 1.9994, "step": 1375 }, { "epoch": 3.56, "grad_norm": 0.5811251997947693, "learning_rate": 8.129833824800453e-06, "loss": 2.0206, "step": 1400 }, { "epoch": 3.62, "grad_norm": 0.5943218469619751, "learning_rate": 8.042485733104382e-06, "loss": 2.0131, "step": 1425 }, { "epoch": 3.69, "grad_norm": 0.595011293888092, "learning_rate": 7.953637540933252e-06, "loss": 2.0231, "step": 1450 }, { "epoch": 3.75, "grad_norm": 0.7110486030578613, "learning_rate": 7.863333054972443e-06, "loss": 2.0297, "step": 1475 }, { "epoch": 3.81, "grad_norm": 0.6503390073776245, "learning_rate": 7.771616799934372e-06, "loss": 2.0163, "step": 1500 }, { "epoch": 3.81, "eval_loss": 1.8906679153442383, "eval_runtime": 2.6486, "eval_samples_per_second": 188.781, "eval_steps_per_second": 23.786, "step": 1500 }, { "epoch": 3.88, "grad_norm": 0.6919596791267395, "learning_rate": 7.67853399660553e-06, "loss": 2.0236, "step": 1525 }, { "epoch": 3.94, "grad_norm": 0.6393699645996094, "learning_rate": 7.584130539550348e-06, "loss": 2.0241, "step": 1550 }, { "epoch": 4.01, "grad_norm": 0.6010494232177734, "learning_rate": 7.488452974482818e-06, "loss": 2.0123, "step": 1575 }, { "epoch": 4.07, "grad_norm": 0.6905403733253479, "learning_rate": 7.3915484753171055e-06, "loss": 2.0073, "step": 1600 }, { "epoch": 4.13, "grad_norm": 0.6041144728660583, "learning_rate": 7.293464820908392e-06, "loss": 2.0118, "step": 1625 }, { "epoch": 4.2, "grad_norm": 0.6558405756950378, "learning_rate": 7.194250371495467e-06, "loss": 2.0059, "step": 1650 }, { "epoch": 4.26, "grad_norm": 0.6472019553184509, "learning_rate": 7.093954044856674e-06, "loss": 1.9895, "step": 1675 }, { "epoch": 4.32, "grad_norm": 0.6358299255371094, "learning_rate": 6.992625292190942e-06, "loss": 1.9934, "step": 1700 }, { "epoch": 4.39, "grad_norm": 0.828366219997406, "learning_rate": 6.89031407373584e-06, "loss": 2.0051, "step": 1725 }, { "epoch": 4.45, "grad_norm": 0.5889772176742554, "learning_rate": 6.787070834134618e-06, "loss": 2.0057, "step": 1750 }, { "epoch": 4.51, "grad_norm": 0.6443700790405273, "learning_rate": 6.682946477564438e-06, "loss": 1.9983, "step": 1775 }, { "epoch": 4.58, "grad_norm": 0.6555039286613464, "learning_rate": 6.57799234263802e-06, "loss": 2.0049, "step": 1800 }, { "epoch": 4.64, "grad_norm": 0.5760651230812073, "learning_rate": 6.47226017709109e-06, "loss": 1.9864, "step": 1825 }, { "epoch": 4.7, "grad_norm": 0.6189552545547485, "learning_rate": 6.365802112268104e-06, "loss": 2.0114, "step": 1850 }, { "epoch": 4.77, "grad_norm": 0.5847841501235962, "learning_rate": 6.258670637418851e-06, "loss": 1.9923, "step": 1875 }, { "epoch": 4.83, "grad_norm": 0.5979297757148743, "learning_rate": 6.150918573818569e-06, "loss": 1.9992, "step": 1900 }, { "epoch": 4.9, "grad_norm": 0.6612520813941956, "learning_rate": 6.042599048724366e-06, "loss": 2.0062, "step": 1925 }, { "epoch": 4.96, "grad_norm": 0.6389756202697754, "learning_rate": 5.933765469180779e-06, "loss": 1.9897, "step": 1950 }, { "epoch": 5.02, "grad_norm": 0.5569688081741333, "learning_rate": 5.82447149568738e-06, "loss": 1.9913, "step": 1975 }, { "epoch": 5.09, "grad_norm": 0.6268473267555237, "learning_rate": 5.714771015741414e-06, "loss": 1.9881, "step": 2000 }, { "epoch": 5.09, "eval_loss": 1.872865915298462, "eval_runtime": 2.6614, "eval_samples_per_second": 187.873, "eval_steps_per_second": 23.672, "step": 2000 }, { "epoch": 5.15, "grad_norm": 0.6172594428062439, "learning_rate": 5.604718117268515e-06, "loss": 1.9858, "step": 2025 }, { "epoch": 5.21, "grad_norm": 0.6328703165054321, "learning_rate": 5.494367061954609e-06, "loss": 1.9934, "step": 2050 }, { "epoch": 5.28, "grad_norm": 0.6563747525215149, "learning_rate": 5.383772258492135e-06, "loss": 1.9751, "step": 2075 }, { "epoch": 5.34, "grad_norm": 0.5623390078544617, "learning_rate": 5.2729882357537864e-06, "loss": 1.9911, "step": 2100 }, { "epoch": 5.4, "grad_norm": 0.5955666303634644, "learning_rate": 5.162069615906998e-06, "loss": 1.9852, "step": 2125 }, { "epoch": 5.47, "grad_norm": 0.6157717108726501, "learning_rate": 5.051071087482442e-06, "loss": 1.967, "step": 2150 }, { "epoch": 5.53, "grad_norm": 0.7150459289550781, "learning_rate": 4.940047378409786e-06, "loss": 1.9798, "step": 2175 }, { "epoch": 5.59, "grad_norm": 0.5714321732521057, "learning_rate": 4.829053229034043e-06, "loss": 1.9864, "step": 2200 }, { "epoch": 5.66, "grad_norm": 0.5759787559509277, "learning_rate": 4.718143365125784e-06, "loss": 1.9802, "step": 2225 }, { "epoch": 5.72, "grad_norm": 0.6573282480239868, "learning_rate": 4.6073724708985575e-06, "loss": 1.9851, "step": 2250 }, { "epoch": 5.79, "grad_norm": 0.6285906434059143, "learning_rate": 4.496795162046774e-06, "loss": 1.9794, "step": 2275 }, { "epoch": 5.85, "grad_norm": 0.645057201385498, "learning_rate": 4.386465958817396e-06, "loss": 1.9776, "step": 2300 }, { "epoch": 5.91, "grad_norm": 0.6082957983016968, "learning_rate": 4.276439259128667e-06, "loss": 1.987, "step": 2325 }, { "epoch": 5.98, "grad_norm": 0.6126915216445923, "learning_rate": 4.1667693117491784e-06, "loss": 1.9837, "step": 2350 }, { "epoch": 6.04, "grad_norm": 0.5728232860565186, "learning_rate": 4.057510189550456e-06, "loss": 1.9822, "step": 2375 }, { "epoch": 6.1, "grad_norm": 0.6158954501152039, "learning_rate": 3.9487157628462784e-06, "loss": 1.9709, "step": 2400 }, { "epoch": 6.17, "grad_norm": 0.5914610028266907, "learning_rate": 3.840439672831872e-06, "loss": 1.9841, "step": 2425 }, { "epoch": 6.23, "grad_norm": 0.6070579290390015, "learning_rate": 3.7327353051360703e-06, "loss": 1.965, "step": 2450 }, { "epoch": 6.29, "grad_norm": 0.6792969107627869, "learning_rate": 3.625655763499467e-06, "loss": 1.9792, "step": 2475 }, { "epoch": 6.36, "grad_norm": 0.5990138053894043, "learning_rate": 3.5192538435915834e-06, "loss": 1.9739, "step": 2500 }, { "epoch": 6.36, "eval_loss": 1.862921118736267, "eval_runtime": 2.6134, "eval_samples_per_second": 191.32, "eval_steps_per_second": 24.106, "step": 2500 } ], "logging_steps": 25, "max_steps": 3930, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }