{ "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 500, "global_step": 32740, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.15271838729383017, "grad_norm": 18.494943618774414, "learning_rate": 1.9694563225412342e-05, "loss": 0.5789, "step": 500 }, { "epoch": 0.30543677458766033, "grad_norm": 8.591017723083496, "learning_rate": 1.938912645082468e-05, "loss": 0.4508, "step": 1000 }, { "epoch": 0.4581551618814905, "grad_norm": 8.066777229309082, "learning_rate": 1.9083689676237022e-05, "loss": 0.4167, "step": 1500 }, { "epoch": 0.6108735491753207, "grad_norm": 11.628117561340332, "learning_rate": 1.8778252901649362e-05, "loss": 0.4007, "step": 2000 }, { "epoch": 0.7635919364691509, "grad_norm": 6.132204055786133, "learning_rate": 1.84728161270617e-05, "loss": 0.3831, "step": 2500 }, { "epoch": 0.916310323762981, "grad_norm": 6.5475640296936035, "learning_rate": 1.816737935247404e-05, "loss": 0.3605, "step": 3000 }, { "epoch": 1.0690287110568113, "grad_norm": 4.849888801574707, "learning_rate": 1.786194257788638e-05, "loss": 0.3139, "step": 3500 }, { "epoch": 1.2217470983506413, "grad_norm": 12.288202285766602, "learning_rate": 1.755650580329872e-05, "loss": 0.2494, "step": 4000 }, { "epoch": 1.3744654856444716, "grad_norm": 19.027870178222656, "learning_rate": 1.725106902871106e-05, "loss": 0.2561, "step": 4500 }, { "epoch": 1.5271838729383018, "grad_norm": 12.914681434631348, "learning_rate": 1.69456322541234e-05, "loss": 0.2464, "step": 5000 }, { "epoch": 1.679902260232132, "grad_norm": 8.226604461669922, "learning_rate": 1.6640195479535736e-05, "loss": 0.2599, "step": 5500 }, { "epoch": 1.832620647525962, "grad_norm": 14.606645584106445, "learning_rate": 1.6334758704948076e-05, "loss": 0.2608, "step": 6000 }, { "epoch": 1.9853390348197923, "grad_norm": 15.511763572692871, "learning_rate": 1.6029321930360416e-05, "loss": 0.2541, "step": 6500 }, { "epoch": 2.1380574221136226, "grad_norm": 16.74727439880371, "learning_rate": 1.5723885155772757e-05, "loss": 0.1556, "step": 7000 }, { "epoch": 2.2907758094074526, "grad_norm": 20.485782623291016, "learning_rate": 1.5418448381185097e-05, "loss": 0.153, "step": 7500 }, { "epoch": 2.4434941967012827, "grad_norm": 7.16798734664917, "learning_rate": 1.5113011606597437e-05, "loss": 0.1579, "step": 8000 }, { "epoch": 2.596212583995113, "grad_norm": 7.562324523925781, "learning_rate": 1.4807574832009775e-05, "loss": 0.1616, "step": 8500 }, { "epoch": 2.748930971288943, "grad_norm": 16.156944274902344, "learning_rate": 1.4502138057422115e-05, "loss": 0.1658, "step": 9000 }, { "epoch": 2.901649358582773, "grad_norm": 13.200675964355469, "learning_rate": 1.4196701282834456e-05, "loss": 0.1657, "step": 9500 }, { "epoch": 3.0543677458766036, "grad_norm": 1.1726553440093994, "learning_rate": 1.3891264508246794e-05, "loss": 0.1469, "step": 10000 }, { "epoch": 3.2070861331704337, "grad_norm": 30.926372528076172, "learning_rate": 1.3585827733659134e-05, "loss": 0.1103, "step": 10500 }, { "epoch": 3.359804520464264, "grad_norm": 11.862848281860352, "learning_rate": 1.3280390959071474e-05, "loss": 0.1011, "step": 11000 }, { "epoch": 3.512522907758094, "grad_norm": 7.604775428771973, "learning_rate": 1.2974954184483813e-05, "loss": 0.1035, "step": 11500 }, { "epoch": 3.665241295051924, "grad_norm": 1.4278388023376465, "learning_rate": 1.2669517409896153e-05, "loss": 0.1145, "step": 12000 }, { "epoch": 3.8179596823457542, "grad_norm": 0.45902055501937866, "learning_rate": 1.2364080635308493e-05, "loss": 0.1138, "step": 12500 }, { "epoch": 3.9706780696395847, "grad_norm": 31.397523880004883, "learning_rate": 1.2058643860720831e-05, "loss": 0.1101, "step": 13000 }, { "epoch": 4.123396456933415, "grad_norm": 0.23616579174995422, "learning_rate": 1.1753207086133171e-05, "loss": 0.0771, "step": 13500 }, { "epoch": 4.276114844227245, "grad_norm": 0.22658583521842957, "learning_rate": 1.1447770311545512e-05, "loss": 0.0693, "step": 14000 }, { "epoch": 4.428833231521075, "grad_norm": 0.35704877972602844, "learning_rate": 1.114233353695785e-05, "loss": 0.0751, "step": 14500 }, { "epoch": 4.581551618814905, "grad_norm": 0.17207038402557373, "learning_rate": 1.083689676237019e-05, "loss": 0.0699, "step": 15000 }, { "epoch": 4.734270006108735, "grad_norm": 8.590784072875977, "learning_rate": 1.053145998778253e-05, "loss": 0.0692, "step": 15500 }, { "epoch": 4.886988393402565, "grad_norm": 25.48099708557129, "learning_rate": 1.0226023213194869e-05, "loss": 0.0727, "step": 16000 }, { "epoch": 5.039706780696396, "grad_norm": 4.0271501541137695, "learning_rate": 9.920586438607209e-06, "loss": 0.0659, "step": 16500 }, { "epoch": 5.192425167990226, "grad_norm": 1.6429097652435303, "learning_rate": 9.615149664019549e-06, "loss": 0.0508, "step": 17000 }, { "epoch": 5.345143555284056, "grad_norm": 0.03841910511255264, "learning_rate": 9.309712889431889e-06, "loss": 0.0462, "step": 17500 }, { "epoch": 5.497861942577886, "grad_norm": 0.03901192545890808, "learning_rate": 9.004276114844227e-06, "loss": 0.0508, "step": 18000 }, { "epoch": 5.650580329871716, "grad_norm": 24.19868278503418, "learning_rate": 8.698839340256568e-06, "loss": 0.0454, "step": 18500 }, { "epoch": 5.803298717165546, "grad_norm": 12.015921592712402, "learning_rate": 8.393402565668908e-06, "loss": 0.0525, "step": 19000 }, { "epoch": 5.956017104459377, "grad_norm": 0.1184128075838089, "learning_rate": 8.087965791081248e-06, "loss": 0.0524, "step": 19500 }, { "epoch": 6.108735491753207, "grad_norm": 53.9754524230957, "learning_rate": 7.782529016493586e-06, "loss": 0.0431, "step": 20000 }, { "epoch": 6.261453879047037, "grad_norm": 0.049675118178129196, "learning_rate": 7.4770922419059255e-06, "loss": 0.0311, "step": 20500 }, { "epoch": 6.414172266340867, "grad_norm": 0.12765900790691376, "learning_rate": 7.171655467318266e-06, "loss": 0.0303, "step": 21000 }, { "epoch": 6.566890653634697, "grad_norm": 0.04678593948483467, "learning_rate": 6.866218692730605e-06, "loss": 0.0315, "step": 21500 }, { "epoch": 6.719609040928528, "grad_norm": 33.2860221862793, "learning_rate": 6.560781918142944e-06, "loss": 0.0342, "step": 22000 }, { "epoch": 6.872327428222358, "grad_norm": 0.022564252838492393, "learning_rate": 6.255345143555285e-06, "loss": 0.0325, "step": 22500 }, { "epoch": 7.025045815516188, "grad_norm": 48.42949676513672, "learning_rate": 5.949908368967624e-06, "loss": 0.0281, "step": 23000 }, { "epoch": 7.177764202810018, "grad_norm": 0.1493069976568222, "learning_rate": 5.644471594379963e-06, "loss": 0.0255, "step": 23500 }, { "epoch": 7.330482590103848, "grad_norm": 0.02931222692131996, "learning_rate": 5.339034819792304e-06, "loss": 0.0218, "step": 24000 }, { "epoch": 7.483200977397678, "grad_norm": 99.97526550292969, "learning_rate": 5.033598045204643e-06, "loss": 0.0234, "step": 24500 }, { "epoch": 7.6359193646915084, "grad_norm": 1.244328260421753, "learning_rate": 4.728161270616982e-06, "loss": 0.0248, "step": 25000 }, { "epoch": 7.788637751985339, "grad_norm": 0.01413074042648077, "learning_rate": 4.4227244960293225e-06, "loss": 0.02, "step": 25500 }, { "epoch": 7.941356139279169, "grad_norm": 0.007764923386275768, "learning_rate": 4.117287721441662e-06, "loss": 0.0242, "step": 26000 }, { "epoch": 8.094074526573, "grad_norm": 0.04147506132721901, "learning_rate": 3.8118509468540015e-06, "loss": 0.0175, "step": 26500 }, { "epoch": 8.24679291386683, "grad_norm": 41.88145446777344, "learning_rate": 3.506414172266341e-06, "loss": 0.0177, "step": 27000 }, { "epoch": 8.39951130116066, "grad_norm": 0.005077702924609184, "learning_rate": 3.2009773976786805e-06, "loss": 0.019, "step": 27500 }, { "epoch": 8.55222968845449, "grad_norm": 87.5268325805664, "learning_rate": 2.8955406230910206e-06, "loss": 0.013, "step": 28000 }, { "epoch": 8.70494807574832, "grad_norm": 0.03963172435760498, "learning_rate": 2.5901038485033603e-06, "loss": 0.0152, "step": 28500 }, { "epoch": 8.85766646304215, "grad_norm": 0.0064314561896026134, "learning_rate": 2.2846670739156996e-06, "loss": 0.0171, "step": 29000 }, { "epoch": 9.01038485033598, "grad_norm": 0.0032072309404611588, "learning_rate": 1.9792302993280393e-06, "loss": 0.0161, "step": 29500 }, { "epoch": 9.16310323762981, "grad_norm": 0.016286134719848633, "learning_rate": 1.6737935247403788e-06, "loss": 0.0117, "step": 30000 }, { "epoch": 9.315821624923641, "grad_norm": 0.04490550234913826, "learning_rate": 1.3683567501527185e-06, "loss": 0.0122, "step": 30500 }, { "epoch": 9.46854001221747, "grad_norm": 0.030366964638233185, "learning_rate": 1.0629199755650582e-06, "loss": 0.0076, "step": 31000 }, { "epoch": 9.621258399511301, "grad_norm": 0.026300914585590363, "learning_rate": 7.574832009773978e-07, "loss": 0.0104, "step": 31500 }, { "epoch": 9.77397678680513, "grad_norm": 0.0014681548345834017, "learning_rate": 4.5204642638973736e-07, "loss": 0.0107, "step": 32000 }, { "epoch": 9.926695174098962, "grad_norm": 0.0018271030858159065, "learning_rate": 1.4660965180207698e-07, "loss": 0.0128, "step": 32500 }, { "epoch": 10.0, "step": 32740, "total_flos": 7.032286628473344e+16, "train_loss": 0.11521797929512122, "train_runtime": 5803.3368, "train_samples_per_second": 180.488, "train_steps_per_second": 5.642 } ], "logging_steps": 500, "max_steps": 32740, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7.032286628473344e+16, "train_batch_size": 32, "trial_name": null, "trial_params": null }