{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 11.734028683181226, "eval_steps": 500, "global_step": 9000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.1303780964797914, "grad_norm": 2.199315071105957, "learning_rate": 1.9800000000000004e-05, "loss": 8.1376, "step": 100 }, { "epoch": 0.2607561929595828, "grad_norm": 1.6398614645004272, "learning_rate": 3.9800000000000005e-05, "loss": 7.2341, "step": 200 }, { "epoch": 0.39113428943937417, "grad_norm": 1.6936060190200806, "learning_rate": 5.9800000000000003e-05, "loss": 6.5612, "step": 300 }, { "epoch": 0.5215123859191656, "grad_norm": 2.0208520889282227, "learning_rate": 7.98e-05, "loss": 6.2796, "step": 400 }, { "epoch": 0.651890482398957, "grad_norm": 2.6271910667419434, "learning_rate": 9.98e-05, "loss": 6.0939, "step": 500 }, { "epoch": 0.7822685788787483, "grad_norm": 3.186422348022461, "learning_rate": 9.913913043478262e-05, "loss": 5.8614, "step": 600 }, { "epoch": 0.9126466753585397, "grad_norm": 2.3998301029205322, "learning_rate": 9.826956521739131e-05, "loss": 5.6819, "step": 700 }, { "epoch": 1.0430247718383312, "grad_norm": 2.232499837875366, "learning_rate": 9.74e-05, "loss": 5.568, "step": 800 }, { "epoch": 1.1734028683181226, "grad_norm": 3.5190041065216064, "learning_rate": 9.65304347826087e-05, "loss": 5.4076, "step": 900 }, { "epoch": 1.303780964797914, "grad_norm": 3.0163352489471436, "learning_rate": 9.566086956521739e-05, "loss": 5.3031, "step": 1000 }, { "epoch": 1.4341590612777053, "grad_norm": 2.686789035797119, "learning_rate": 9.47913043478261e-05, "loss": 5.1789, "step": 1100 }, { "epoch": 1.5645371577574967, "grad_norm": 2.6344940662384033, "learning_rate": 9.392173913043478e-05, "loss": 5.0844, "step": 1200 }, { "epoch": 1.694915254237288, "grad_norm": 3.4910781383514404, "learning_rate": 9.305217391304349e-05, "loss": 4.9929, "step": 1300 }, { "epoch": 1.8252933507170797, "grad_norm": 3.5532472133636475, "learning_rate": 9.218260869565217e-05, "loss": 4.8998, "step": 1400 }, { "epoch": 1.9556714471968708, "grad_norm": 3.246469497680664, "learning_rate": 9.131304347826088e-05, "loss": 4.7656, "step": 1500 }, { "epoch": 2.0860495436766624, "grad_norm": 3.618093729019165, "learning_rate": 9.044347826086958e-05, "loss": 4.5793, "step": 1600 }, { "epoch": 2.2164276401564535, "grad_norm": 4.264451026916504, "learning_rate": 8.957391304347826e-05, "loss": 4.4459, "step": 1700 }, { "epoch": 2.346805736636245, "grad_norm": 4.198596477508545, "learning_rate": 8.870434782608697e-05, "loss": 4.3207, "step": 1800 }, { "epoch": 2.4771838331160367, "grad_norm": 4.508902549743652, "learning_rate": 8.783478260869565e-05, "loss": 4.235, "step": 1900 }, { "epoch": 2.607561929595828, "grad_norm": 4.511523723602295, "learning_rate": 8.696521739130436e-05, "loss": 4.1349, "step": 2000 }, { "epoch": 2.737940026075619, "grad_norm": 3.903628349304199, "learning_rate": 8.609565217391304e-05, "loss": 4.0358, "step": 2100 }, { "epoch": 2.8683181225554106, "grad_norm": 4.530259609222412, "learning_rate": 8.523478260869565e-05, "loss": 3.9168, "step": 2200 }, { "epoch": 2.9986962190352022, "grad_norm": 4.071807861328125, "learning_rate": 8.436521739130435e-05, "loss": 3.8121, "step": 2300 }, { "epoch": 3.1290743155149934, "grad_norm": 4.118851661682129, "learning_rate": 8.349565217391306e-05, "loss": 3.6089, "step": 2400 }, { "epoch": 3.259452411994785, "grad_norm": 4.954625606536865, "learning_rate": 8.262608695652174e-05, "loss": 3.5162, "step": 2500 }, { "epoch": 3.389830508474576, "grad_norm": 4.958930492401123, "learning_rate": 8.175652173913044e-05, "loss": 3.427, "step": 2600 }, { "epoch": 3.5202086049543677, "grad_norm": 4.683836460113525, "learning_rate": 8.088695652173913e-05, "loss": 3.3498, "step": 2700 }, { "epoch": 3.6505867014341593, "grad_norm": 5.2733259201049805, "learning_rate": 8.001739130434783e-05, "loss": 3.2661, "step": 2800 }, { "epoch": 3.7809647979139505, "grad_norm": 5.038583278656006, "learning_rate": 7.914782608695652e-05, "loss": 3.1859, "step": 2900 }, { "epoch": 3.9113428943937416, "grad_norm": 5.139358043670654, "learning_rate": 7.827826086956522e-05, "loss": 3.125, "step": 3000 }, { "epoch": 4.041720990873533, "grad_norm": 5.076649188995361, "learning_rate": 7.740869565217391e-05, "loss": 2.9774, "step": 3100 }, { "epoch": 4.172099087353325, "grad_norm": 7.2738165855407715, "learning_rate": 7.653913043478261e-05, "loss": 2.8133, "step": 3200 }, { "epoch": 4.302477183833116, "grad_norm": 4.856014728546143, "learning_rate": 7.56695652173913e-05, "loss": 2.7578, "step": 3300 }, { "epoch": 4.432855280312907, "grad_norm": 4.625192165374756, "learning_rate": 7.48e-05, "loss": 2.706, "step": 3400 }, { "epoch": 4.563233376792699, "grad_norm": 5.898316860198975, "learning_rate": 7.39304347826087e-05, "loss": 2.6505, "step": 3500 }, { "epoch": 4.69361147327249, "grad_norm": 5.6338324546813965, "learning_rate": 7.30608695652174e-05, "loss": 2.5716, "step": 3600 }, { "epoch": 4.823989569752282, "grad_norm": 4.824554443359375, "learning_rate": 7.219130434782609e-05, "loss": 2.4845, "step": 3700 }, { "epoch": 4.9543676662320735, "grad_norm": 6.3988037109375, "learning_rate": 7.132173913043479e-05, "loss": 2.4531, "step": 3800 }, { "epoch": 5.084745762711864, "grad_norm": 4.477128982543945, "learning_rate": 7.045217391304348e-05, "loss": 2.3122, "step": 3900 }, { "epoch": 5.215123859191656, "grad_norm": 5.06989049911499, "learning_rate": 6.958260869565218e-05, "loss": 2.2015, "step": 4000 }, { "epoch": 5.345501955671447, "grad_norm": 4.576299667358398, "learning_rate": 6.871304347826087e-05, "loss": 2.1651, "step": 4100 }, { "epoch": 5.475880052151239, "grad_norm": 4.288216590881348, "learning_rate": 6.785217391304349e-05, "loss": 2.1132, "step": 4200 }, { "epoch": 5.60625814863103, "grad_norm": 6.912166118621826, "learning_rate": 6.698260869565218e-05, "loss": 2.0712, "step": 4300 }, { "epoch": 5.736636245110821, "grad_norm": 5.21943998336792, "learning_rate": 6.611304347826088e-05, "loss": 2.0318, "step": 4400 }, { "epoch": 5.867014341590613, "grad_norm": 6.814871311187744, "learning_rate": 6.524347826086957e-05, "loss": 1.996, "step": 4500 }, { "epoch": 5.9973924380704045, "grad_norm": 6.987309455871582, "learning_rate": 6.437391304347827e-05, "loss": 1.9435, "step": 4600 }, { "epoch": 6.127770534550195, "grad_norm": 6.552413463592529, "learning_rate": 6.350434782608696e-05, "loss": 1.7934, "step": 4700 }, { "epoch": 6.258148631029987, "grad_norm": 4.565166473388672, "learning_rate": 6.263478260869565e-05, "loss": 1.7635, "step": 4800 }, { "epoch": 6.388526727509778, "grad_norm": 6.433986186981201, "learning_rate": 6.176521739130436e-05, "loss": 1.7403, "step": 4900 }, { "epoch": 6.51890482398957, "grad_norm": 4.71969747543335, "learning_rate": 6.0895652173913044e-05, "loss": 1.7153, "step": 5000 }, { "epoch": 6.6492829204693615, "grad_norm": 5.415853023529053, "learning_rate": 6.002608695652174e-05, "loss": 1.6563, "step": 5100 }, { "epoch": 6.779661016949152, "grad_norm": 4.742384910583496, "learning_rate": 5.9156521739130436e-05, "loss": 1.6431, "step": 5200 }, { "epoch": 6.910039113428944, "grad_norm": 4.704698085784912, "learning_rate": 5.828695652173913e-05, "loss": 1.6071, "step": 5300 }, { "epoch": 7.040417209908735, "grad_norm": 4.532726764678955, "learning_rate": 5.7417391304347834e-05, "loss": 1.561, "step": 5400 }, { "epoch": 7.170795306388527, "grad_norm": 4.465181827545166, "learning_rate": 5.654782608695652e-05, "loss": 1.4572, "step": 5500 }, { "epoch": 7.301173402868318, "grad_norm": 5.061060428619385, "learning_rate": 5.568695652173913e-05, "loss": 1.4313, "step": 5600 }, { "epoch": 7.431551499348109, "grad_norm": 4.221780300140381, "learning_rate": 5.481739130434783e-05, "loss": 1.4099, "step": 5700 }, { "epoch": 7.561929595827901, "grad_norm": 4.012993812561035, "learning_rate": 5.394782608695652e-05, "loss": 1.4044, "step": 5800 }, { "epoch": 7.6923076923076925, "grad_norm": 5.935675621032715, "learning_rate": 5.307826086956522e-05, "loss": 1.3888, "step": 5900 }, { "epoch": 7.822685788787483, "grad_norm": 5.962758541107178, "learning_rate": 5.220869565217391e-05, "loss": 1.3509, "step": 6000 }, { "epoch": 7.953063885267275, "grad_norm": 5.295892238616943, "learning_rate": 5.1339130434782615e-05, "loss": 1.3376, "step": 6100 }, { "epoch": 8.083441981747066, "grad_norm": 4.8437347412109375, "learning_rate": 5.0469565217391304e-05, "loss": 1.2682, "step": 6200 }, { "epoch": 8.213820078226858, "grad_norm": 6.373332500457764, "learning_rate": 4.96e-05, "loss": 1.2201, "step": 6300 }, { "epoch": 8.34419817470665, "grad_norm": 4.604928493499756, "learning_rate": 4.8730434782608695e-05, "loss": 1.2068, "step": 6400 }, { "epoch": 8.474576271186441, "grad_norm": 6.433839797973633, "learning_rate": 4.786086956521739e-05, "loss": 1.197, "step": 6500 }, { "epoch": 8.604954367666233, "grad_norm": 3.5552725791931152, "learning_rate": 4.699130434782609e-05, "loss": 1.1726, "step": 6600 }, { "epoch": 8.735332464146023, "grad_norm": 8.83135986328125, "learning_rate": 4.612173913043478e-05, "loss": 1.1484, "step": 6700 }, { "epoch": 8.865710560625814, "grad_norm": 4.828709602355957, "learning_rate": 4.5252173913043485e-05, "loss": 1.1583, "step": 6800 }, { "epoch": 8.996088657105606, "grad_norm": 4.650803089141846, "learning_rate": 4.438260869565218e-05, "loss": 1.1481, "step": 6900 }, { "epoch": 9.126466753585397, "grad_norm": 3.8928608894348145, "learning_rate": 4.3513043478260876e-05, "loss": 1.0397, "step": 7000 }, { "epoch": 9.256844850065189, "grad_norm": 3.941554307937622, "learning_rate": 4.2643478260869565e-05, "loss": 1.0332, "step": 7100 }, { "epoch": 9.38722294654498, "grad_norm": 5.031588077545166, "learning_rate": 4.177391304347826e-05, "loss": 1.0444, "step": 7200 }, { "epoch": 9.517601043024772, "grad_norm": 5.682355880737305, "learning_rate": 4.090434782608696e-05, "loss": 1.0123, "step": 7300 }, { "epoch": 9.647979139504564, "grad_norm": 4.108279705047607, "learning_rate": 4.003478260869565e-05, "loss": 1.0194, "step": 7400 }, { "epoch": 9.778357235984355, "grad_norm": 3.3550074100494385, "learning_rate": 3.916521739130435e-05, "loss": 1.007, "step": 7500 }, { "epoch": 9.908735332464147, "grad_norm": 4.076298713684082, "learning_rate": 3.8295652173913044e-05, "loss": 0.9973, "step": 7600 }, { "epoch": 10.039113428943937, "grad_norm": 4.534595012664795, "learning_rate": 3.742608695652174e-05, "loss": 0.9646, "step": 7700 }, { "epoch": 10.169491525423728, "grad_norm": 3.5908210277557373, "learning_rate": 3.6556521739130435e-05, "loss": 0.8984, "step": 7800 }, { "epoch": 10.29986962190352, "grad_norm": 4.6488213539123535, "learning_rate": 3.568695652173913e-05, "loss": 0.8942, "step": 7900 }, { "epoch": 10.430247718383312, "grad_norm": 4.129245758056641, "learning_rate": 3.481739130434783e-05, "loss": 0.8929, "step": 8000 }, { "epoch": 10.560625814863103, "grad_norm": 3.645109176635742, "learning_rate": 3.394782608695652e-05, "loss": 0.8892, "step": 8100 }, { "epoch": 10.691003911342895, "grad_norm": 4.7509565353393555, "learning_rate": 3.307826086956522e-05, "loss": 0.8937, "step": 8200 }, { "epoch": 10.821382007822686, "grad_norm": 4.302902698516846, "learning_rate": 3.2208695652173914e-05, "loss": 0.8836, "step": 8300 }, { "epoch": 10.951760104302478, "grad_norm": 4.266739845275879, "learning_rate": 3.133913043478261e-05, "loss": 0.8875, "step": 8400 }, { "epoch": 11.082138200782268, "grad_norm": 3.6782920360565186, "learning_rate": 3.0469565217391305e-05, "loss": 0.8411, "step": 8500 }, { "epoch": 11.21251629726206, "grad_norm": 3.9318060874938965, "learning_rate": 2.96e-05, "loss": 0.8063, "step": 8600 }, { "epoch": 11.342894393741851, "grad_norm": 4.048793315887451, "learning_rate": 2.8730434782608694e-05, "loss": 0.7968, "step": 8700 }, { "epoch": 11.473272490221643, "grad_norm": 4.4102888107299805, "learning_rate": 2.786086956521739e-05, "loss": 0.7983, "step": 8800 }, { "epoch": 11.603650586701434, "grad_norm": 4.744139671325684, "learning_rate": 2.6991304347826092e-05, "loss": 0.7903, "step": 8900 }, { "epoch": 11.734028683181226, "grad_norm": 4.147999286651611, "learning_rate": 2.6121739130434788e-05, "loss": 0.7997, "step": 9000 } ], "logging_steps": 100, "max_steps": 12000, "num_input_tokens_seen": 0, "num_train_epochs": 16, "save_steps": 3000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.7976884011008e+18, "train_batch_size": 32, "trial_name": null, "trial_params": null }