| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 6.357279084551812, |
| "eval_steps": 500, |
| "global_step": 2500, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.06, |
| "grad_norm": 1.6613423824310303, |
| "learning_rate": 6.361323155216286e-07, |
| "loss": 2.3639, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.13, |
| "grad_norm": 1.3924915790557861, |
| "learning_rate": 1.2722646310432571e-06, |
| "loss": 2.3559, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.19, |
| "grad_norm": 0.9411530494689941, |
| "learning_rate": 1.908396946564886e-06, |
| "loss": 2.3245, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.25, |
| "grad_norm": 0.7608889937400818, |
| "learning_rate": 2.5445292620865143e-06, |
| "loss": 2.2691, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.32, |
| "grad_norm": 0.8099371790885925, |
| "learning_rate": 3.1806615776081427e-06, |
| "loss": 2.2409, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.38, |
| "grad_norm": 0.6795283555984497, |
| "learning_rate": 3.816793893129772e-06, |
| "loss": 2.2435, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.45, |
| "grad_norm": 0.6041733622550964, |
| "learning_rate": 4.4529262086514e-06, |
| "loss": 2.2375, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.51, |
| "grad_norm": 0.644061803817749, |
| "learning_rate": 5.0890585241730285e-06, |
| "loss": 2.2012, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.57, |
| "grad_norm": 0.6609583497047424, |
| "learning_rate": 5.725190839694656e-06, |
| "loss": 2.213, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.64, |
| "grad_norm": 0.6314918994903564, |
| "learning_rate": 6.3613231552162854e-06, |
| "loss": 2.2088, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.7, |
| "grad_norm": 0.6217303276062012, |
| "learning_rate": 6.997455470737914e-06, |
| "loss": 2.1823, |
| "step": 275 |
| }, |
| { |
| "epoch": 0.76, |
| "grad_norm": 0.624250054359436, |
| "learning_rate": 7.633587786259543e-06, |
| "loss": 2.1685, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.83, |
| "grad_norm": 0.692873477935791, |
| "learning_rate": 8.26972010178117e-06, |
| "loss": 2.166, |
| "step": 325 |
| }, |
| { |
| "epoch": 0.89, |
| "grad_norm": 0.6286394000053406, |
| "learning_rate": 8.9058524173028e-06, |
| "loss": 2.1521, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.95, |
| "grad_norm": 0.6370307803153992, |
| "learning_rate": 9.54198473282443e-06, |
| "loss": 2.149, |
| "step": 375 |
| }, |
| { |
| "epoch": 1.02, |
| "grad_norm": 0.614319384098053, |
| "learning_rate": 9.999903358354628e-06, |
| "loss": 2.1586, |
| "step": 400 |
| }, |
| { |
| "epoch": 1.08, |
| "grad_norm": 0.6239405274391174, |
| "learning_rate": 9.997980516292023e-06, |
| "loss": 2.1352, |
| "step": 425 |
| }, |
| { |
| "epoch": 1.14, |
| "grad_norm": 0.6648218631744385, |
| "learning_rate": 9.99359341519765e-06, |
| "loss": 2.1352, |
| "step": 450 |
| }, |
| { |
| "epoch": 1.21, |
| "grad_norm": 0.6202364563941956, |
| "learning_rate": 9.986744218135864e-06, |
| "loss": 2.1187, |
| "step": 475 |
| }, |
| { |
| "epoch": 1.27, |
| "grad_norm": 0.6447356939315796, |
| "learning_rate": 9.977436302109771e-06, |
| "loss": 2.1135, |
| "step": 500 |
| }, |
| { |
| "epoch": 1.27, |
| "eval_loss": 1.9783179759979248, |
| "eval_runtime": 2.6728, |
| "eval_samples_per_second": 187.071, |
| "eval_steps_per_second": 23.571, |
| "step": 500 |
| }, |
| { |
| "epoch": 1.34, |
| "grad_norm": 0.6953230500221252, |
| "learning_rate": 9.96567425639619e-06, |
| "loss": 2.1071, |
| "step": 525 |
| }, |
| { |
| "epoch": 1.4, |
| "grad_norm": 0.6353166103363037, |
| "learning_rate": 9.951463880282912e-06, |
| "loss": 2.109, |
| "step": 550 |
| }, |
| { |
| "epoch": 1.46, |
| "grad_norm": 0.5800075531005859, |
| "learning_rate": 9.93481218020935e-06, |
| "loss": 2.1165, |
| "step": 575 |
| }, |
| { |
| "epoch": 1.53, |
| "grad_norm": 0.6457290053367615, |
| "learning_rate": 9.915727366312012e-06, |
| "loss": 2.1009, |
| "step": 600 |
| }, |
| { |
| "epoch": 1.59, |
| "grad_norm": 0.5942001938819885, |
| "learning_rate": 9.894218848376482e-06, |
| "loss": 2.1133, |
| "step": 625 |
| }, |
| { |
| "epoch": 1.65, |
| "grad_norm": 0.6016191244125366, |
| "learning_rate": 9.870297231197922e-06, |
| "loss": 2.111, |
| "step": 650 |
| }, |
| { |
| "epoch": 1.72, |
| "grad_norm": 0.6793413758277893, |
| "learning_rate": 9.843974309352356e-06, |
| "loss": 2.0791, |
| "step": 675 |
| }, |
| { |
| "epoch": 1.78, |
| "grad_norm": 0.6286611557006836, |
| "learning_rate": 9.81526306138136e-06, |
| "loss": 2.1128, |
| "step": 700 |
| }, |
| { |
| "epoch": 1.84, |
| "grad_norm": 0.6072127223014832, |
| "learning_rate": 9.784177643392958e-06, |
| "loss": 2.0818, |
| "step": 725 |
| }, |
| { |
| "epoch": 1.91, |
| "grad_norm": 0.581510066986084, |
| "learning_rate": 9.750733382081965e-06, |
| "loss": 2.0686, |
| "step": 750 |
| }, |
| { |
| "epoch": 1.97, |
| "grad_norm": 0.607868492603302, |
| "learning_rate": 9.714946767173124e-06, |
| "loss": 2.0733, |
| "step": 775 |
| }, |
| { |
| "epoch": 2.03, |
| "grad_norm": 0.7183848023414612, |
| "learning_rate": 9.676835443290842e-06, |
| "loss": 2.0666, |
| "step": 800 |
| }, |
| { |
| "epoch": 2.1, |
| "grad_norm": 0.6486113667488098, |
| "learning_rate": 9.63641820125949e-06, |
| "loss": 2.0623, |
| "step": 825 |
| }, |
| { |
| "epoch": 2.16, |
| "grad_norm": 0.6377106308937073, |
| "learning_rate": 9.593714968838568e-06, |
| "loss": 2.0564, |
| "step": 850 |
| }, |
| { |
| "epoch": 2.23, |
| "grad_norm": 0.5825337171554565, |
| "learning_rate": 9.548746800897305e-06, |
| "loss": 2.0636, |
| "step": 875 |
| }, |
| { |
| "epoch": 2.29, |
| "grad_norm": 0.6640869975090027, |
| "learning_rate": 9.501535869033537e-06, |
| "loss": 2.0637, |
| "step": 900 |
| }, |
| { |
| "epoch": 2.35, |
| "grad_norm": 0.604284405708313, |
| "learning_rate": 9.452105450641974e-06, |
| "loss": 2.063, |
| "step": 925 |
| }, |
| { |
| "epoch": 2.42, |
| "grad_norm": 0.6545228362083435, |
| "learning_rate": 9.400479917437267e-06, |
| "loss": 2.0379, |
| "step": 950 |
| }, |
| { |
| "epoch": 2.48, |
| "grad_norm": 0.6467187404632568, |
| "learning_rate": 9.346684723437504e-06, |
| "loss": 2.0654, |
| "step": 975 |
| }, |
| { |
| "epoch": 2.54, |
| "grad_norm": 0.6103580594062805, |
| "learning_rate": 9.290746392414084e-06, |
| "loss": 2.0503, |
| "step": 1000 |
| }, |
| { |
| "epoch": 2.54, |
| "eval_loss": 1.9211387634277344, |
| "eval_runtime": 2.6351, |
| "eval_samples_per_second": 189.749, |
| "eval_steps_per_second": 23.908, |
| "step": 1000 |
| }, |
| { |
| "epoch": 2.61, |
| "grad_norm": 0.6254684329032898, |
| "learning_rate": 9.232692504814154e-06, |
| "loss": 2.0655, |
| "step": 1025 |
| }, |
| { |
| "epoch": 2.67, |
| "grad_norm": 0.6028435230255127, |
| "learning_rate": 9.172551684162025e-06, |
| "loss": 2.0678, |
| "step": 1050 |
| }, |
| { |
| "epoch": 2.73, |
| "grad_norm": 0.609348714351654, |
| "learning_rate": 9.110353582946341e-06, |
| "loss": 2.0406, |
| "step": 1075 |
| }, |
| { |
| "epoch": 2.8, |
| "grad_norm": 0.6535661816596985, |
| "learning_rate": 9.046128867999867e-06, |
| "loss": 2.072, |
| "step": 1100 |
| }, |
| { |
| "epoch": 2.86, |
| "grad_norm": 0.6350586414337158, |
| "learning_rate": 8.979909205379198e-06, |
| "loss": 2.0436, |
| "step": 1125 |
| }, |
| { |
| "epoch": 2.92, |
| "grad_norm": 0.6003224849700928, |
| "learning_rate": 8.911727244751763e-06, |
| "loss": 2.0428, |
| "step": 1150 |
| }, |
| { |
| "epoch": 2.99, |
| "grad_norm": 0.6171026825904846, |
| "learning_rate": 8.84161660329789e-06, |
| "loss": 2.0466, |
| "step": 1175 |
| }, |
| { |
| "epoch": 3.05, |
| "grad_norm": 0.6239070892333984, |
| "learning_rate": 8.76961184913581e-06, |
| "loss": 2.0201, |
| "step": 1200 |
| }, |
| { |
| "epoch": 3.12, |
| "grad_norm": 0.6221436858177185, |
| "learning_rate": 8.695748484277833e-06, |
| "loss": 2.0281, |
| "step": 1225 |
| }, |
| { |
| "epoch": 3.18, |
| "grad_norm": 0.6175958514213562, |
| "learning_rate": 8.620062927126021e-06, |
| "loss": 2.0472, |
| "step": 1250 |
| }, |
| { |
| "epoch": 3.24, |
| "grad_norm": 0.6366366147994995, |
| "learning_rate": 8.54259249451608e-06, |
| "loss": 2.0326, |
| "step": 1275 |
| }, |
| { |
| "epoch": 3.31, |
| "grad_norm": 0.6660250425338745, |
| "learning_rate": 8.463375383318254e-06, |
| "loss": 2.0263, |
| "step": 1300 |
| }, |
| { |
| "epoch": 3.37, |
| "grad_norm": 0.5739914178848267, |
| "learning_rate": 8.382450651604316e-06, |
| "loss": 2.0265, |
| "step": 1325 |
| }, |
| { |
| "epoch": 3.43, |
| "grad_norm": 0.6790344715118408, |
| "learning_rate": 8.29985819938996e-06, |
| "loss": 2.0219, |
| "step": 1350 |
| }, |
| { |
| "epoch": 3.5, |
| "grad_norm": 0.6223481893539429, |
| "learning_rate": 8.215638748962047e-06, |
| "loss": 1.9994, |
| "step": 1375 |
| }, |
| { |
| "epoch": 3.56, |
| "grad_norm": 0.5811251997947693, |
| "learning_rate": 8.129833824800453e-06, |
| "loss": 2.0206, |
| "step": 1400 |
| }, |
| { |
| "epoch": 3.62, |
| "grad_norm": 0.5943218469619751, |
| "learning_rate": 8.042485733104382e-06, |
| "loss": 2.0131, |
| "step": 1425 |
| }, |
| { |
| "epoch": 3.69, |
| "grad_norm": 0.595011293888092, |
| "learning_rate": 7.953637540933252e-06, |
| "loss": 2.0231, |
| "step": 1450 |
| }, |
| { |
| "epoch": 3.75, |
| "grad_norm": 0.7110486030578613, |
| "learning_rate": 7.863333054972443e-06, |
| "loss": 2.0297, |
| "step": 1475 |
| }, |
| { |
| "epoch": 3.81, |
| "grad_norm": 0.6503390073776245, |
| "learning_rate": 7.771616799934372e-06, |
| "loss": 2.0163, |
| "step": 1500 |
| }, |
| { |
| "epoch": 3.81, |
| "eval_loss": 1.8906679153442383, |
| "eval_runtime": 2.6486, |
| "eval_samples_per_second": 188.781, |
| "eval_steps_per_second": 23.786, |
| "step": 1500 |
| }, |
| { |
| "epoch": 3.88, |
| "grad_norm": 0.6919596791267395, |
| "learning_rate": 7.67853399660553e-06, |
| "loss": 2.0236, |
| "step": 1525 |
| }, |
| { |
| "epoch": 3.94, |
| "grad_norm": 0.6393699645996094, |
| "learning_rate": 7.584130539550348e-06, |
| "loss": 2.0241, |
| "step": 1550 |
| }, |
| { |
| "epoch": 4.01, |
| "grad_norm": 0.6010494232177734, |
| "learning_rate": 7.488452974482818e-06, |
| "loss": 2.0123, |
| "step": 1575 |
| }, |
| { |
| "epoch": 4.07, |
| "grad_norm": 0.6905403733253479, |
| "learning_rate": 7.3915484753171055e-06, |
| "loss": 2.0073, |
| "step": 1600 |
| }, |
| { |
| "epoch": 4.13, |
| "grad_norm": 0.6041144728660583, |
| "learning_rate": 7.293464820908392e-06, |
| "loss": 2.0118, |
| "step": 1625 |
| }, |
| { |
| "epoch": 4.2, |
| "grad_norm": 0.6558405756950378, |
| "learning_rate": 7.194250371495467e-06, |
| "loss": 2.0059, |
| "step": 1650 |
| }, |
| { |
| "epoch": 4.26, |
| "grad_norm": 0.6472019553184509, |
| "learning_rate": 7.093954044856674e-06, |
| "loss": 1.9895, |
| "step": 1675 |
| }, |
| { |
| "epoch": 4.32, |
| "grad_norm": 0.6358299255371094, |
| "learning_rate": 6.992625292190942e-06, |
| "loss": 1.9934, |
| "step": 1700 |
| }, |
| { |
| "epoch": 4.39, |
| "grad_norm": 0.828366219997406, |
| "learning_rate": 6.89031407373584e-06, |
| "loss": 2.0051, |
| "step": 1725 |
| }, |
| { |
| "epoch": 4.45, |
| "grad_norm": 0.5889772176742554, |
| "learning_rate": 6.787070834134618e-06, |
| "loss": 2.0057, |
| "step": 1750 |
| }, |
| { |
| "epoch": 4.51, |
| "grad_norm": 0.6443700790405273, |
| "learning_rate": 6.682946477564438e-06, |
| "loss": 1.9983, |
| "step": 1775 |
| }, |
| { |
| "epoch": 4.58, |
| "grad_norm": 0.6555039286613464, |
| "learning_rate": 6.57799234263802e-06, |
| "loss": 2.0049, |
| "step": 1800 |
| }, |
| { |
| "epoch": 4.64, |
| "grad_norm": 0.5760651230812073, |
| "learning_rate": 6.47226017709109e-06, |
| "loss": 1.9864, |
| "step": 1825 |
| }, |
| { |
| "epoch": 4.7, |
| "grad_norm": 0.6189552545547485, |
| "learning_rate": 6.365802112268104e-06, |
| "loss": 2.0114, |
| "step": 1850 |
| }, |
| { |
| "epoch": 4.77, |
| "grad_norm": 0.5847841501235962, |
| "learning_rate": 6.258670637418851e-06, |
| "loss": 1.9923, |
| "step": 1875 |
| }, |
| { |
| "epoch": 4.83, |
| "grad_norm": 0.5979297757148743, |
| "learning_rate": 6.150918573818569e-06, |
| "loss": 1.9992, |
| "step": 1900 |
| }, |
| { |
| "epoch": 4.9, |
| "grad_norm": 0.6612520813941956, |
| "learning_rate": 6.042599048724366e-06, |
| "loss": 2.0062, |
| "step": 1925 |
| }, |
| { |
| "epoch": 4.96, |
| "grad_norm": 0.6389756202697754, |
| "learning_rate": 5.933765469180779e-06, |
| "loss": 1.9897, |
| "step": 1950 |
| }, |
| { |
| "epoch": 5.02, |
| "grad_norm": 0.5569688081741333, |
| "learning_rate": 5.82447149568738e-06, |
| "loss": 1.9913, |
| "step": 1975 |
| }, |
| { |
| "epoch": 5.09, |
| "grad_norm": 0.6268473267555237, |
| "learning_rate": 5.714771015741414e-06, |
| "loss": 1.9881, |
| "step": 2000 |
| }, |
| { |
| "epoch": 5.09, |
| "eval_loss": 1.872865915298462, |
| "eval_runtime": 2.6614, |
| "eval_samples_per_second": 187.873, |
| "eval_steps_per_second": 23.672, |
| "step": 2000 |
| }, |
| { |
| "epoch": 5.15, |
| "grad_norm": 0.6172594428062439, |
| "learning_rate": 5.604718117268515e-06, |
| "loss": 1.9858, |
| "step": 2025 |
| }, |
| { |
| "epoch": 5.21, |
| "grad_norm": 0.6328703165054321, |
| "learning_rate": 5.494367061954609e-06, |
| "loss": 1.9934, |
| "step": 2050 |
| }, |
| { |
| "epoch": 5.28, |
| "grad_norm": 0.6563747525215149, |
| "learning_rate": 5.383772258492135e-06, |
| "loss": 1.9751, |
| "step": 2075 |
| }, |
| { |
| "epoch": 5.34, |
| "grad_norm": 0.5623390078544617, |
| "learning_rate": 5.2729882357537864e-06, |
| "loss": 1.9911, |
| "step": 2100 |
| }, |
| { |
| "epoch": 5.4, |
| "grad_norm": 0.5955666303634644, |
| "learning_rate": 5.162069615906998e-06, |
| "loss": 1.9852, |
| "step": 2125 |
| }, |
| { |
| "epoch": 5.47, |
| "grad_norm": 0.6157717108726501, |
| "learning_rate": 5.051071087482442e-06, |
| "loss": 1.967, |
| "step": 2150 |
| }, |
| { |
| "epoch": 5.53, |
| "grad_norm": 0.7150459289550781, |
| "learning_rate": 4.940047378409786e-06, |
| "loss": 1.9798, |
| "step": 2175 |
| }, |
| { |
| "epoch": 5.59, |
| "grad_norm": 0.5714321732521057, |
| "learning_rate": 4.829053229034043e-06, |
| "loss": 1.9864, |
| "step": 2200 |
| }, |
| { |
| "epoch": 5.66, |
| "grad_norm": 0.5759787559509277, |
| "learning_rate": 4.718143365125784e-06, |
| "loss": 1.9802, |
| "step": 2225 |
| }, |
| { |
| "epoch": 5.72, |
| "grad_norm": 0.6573282480239868, |
| "learning_rate": 4.6073724708985575e-06, |
| "loss": 1.9851, |
| "step": 2250 |
| }, |
| { |
| "epoch": 5.79, |
| "grad_norm": 0.6285906434059143, |
| "learning_rate": 4.496795162046774e-06, |
| "loss": 1.9794, |
| "step": 2275 |
| }, |
| { |
| "epoch": 5.85, |
| "grad_norm": 0.645057201385498, |
| "learning_rate": 4.386465958817396e-06, |
| "loss": 1.9776, |
| "step": 2300 |
| }, |
| { |
| "epoch": 5.91, |
| "grad_norm": 0.6082957983016968, |
| "learning_rate": 4.276439259128667e-06, |
| "loss": 1.987, |
| "step": 2325 |
| }, |
| { |
| "epoch": 5.98, |
| "grad_norm": 0.6126915216445923, |
| "learning_rate": 4.1667693117491784e-06, |
| "loss": 1.9837, |
| "step": 2350 |
| }, |
| { |
| "epoch": 6.04, |
| "grad_norm": 0.5728232860565186, |
| "learning_rate": 4.057510189550456e-06, |
| "loss": 1.9822, |
| "step": 2375 |
| }, |
| { |
| "epoch": 6.1, |
| "grad_norm": 0.6158954501152039, |
| "learning_rate": 3.9487157628462784e-06, |
| "loss": 1.9709, |
| "step": 2400 |
| }, |
| { |
| "epoch": 6.17, |
| "grad_norm": 0.5914610028266907, |
| "learning_rate": 3.840439672831872e-06, |
| "loss": 1.9841, |
| "step": 2425 |
| }, |
| { |
| "epoch": 6.23, |
| "grad_norm": 0.6070579290390015, |
| "learning_rate": 3.7327353051360703e-06, |
| "loss": 1.965, |
| "step": 2450 |
| }, |
| { |
| "epoch": 6.29, |
| "grad_norm": 0.6792969107627869, |
| "learning_rate": 3.625655763499467e-06, |
| "loss": 1.9792, |
| "step": 2475 |
| }, |
| { |
| "epoch": 6.36, |
| "grad_norm": 0.5990138053894043, |
| "learning_rate": 3.5192538435915834e-06, |
| "loss": 1.9739, |
| "step": 2500 |
| }, |
| { |
| "epoch": 6.36, |
| "eval_loss": 1.862921118736267, |
| "eval_runtime": 2.6134, |
| "eval_samples_per_second": 191.32, |
| "eval_steps_per_second": 24.106, |
| "step": 2500 |
| } |
| ], |
| "logging_steps": 25, |
| "max_steps": 3930, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 10, |
| "save_steps": 500, |
| "total_flos": 0.0, |
| "train_batch_size": 8, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|