{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9911012235817576, "eval_steps": 500, "global_step": 672, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.04449388209121246, "grad_norm": 17.919046764867105, "learning_rate": 5e-06, "loss": 0.7727, "step": 10 }, { "epoch": 0.08898776418242492, "grad_norm": 2.5068176722694706, "learning_rate": 5e-06, "loss": 0.7067, "step": 20 }, { "epoch": 0.13348164627363737, "grad_norm": 0.8188606020387743, "learning_rate": 5e-06, "loss": 0.6732, "step": 30 }, { "epoch": 0.17797552836484984, "grad_norm": 0.7944470655707544, "learning_rate": 5e-06, "loss": 0.6514, "step": 40 }, { "epoch": 0.22246941045606228, "grad_norm": 0.9562761756923293, "learning_rate": 5e-06, "loss": 0.6433, "step": 50 }, { "epoch": 0.26696329254727474, "grad_norm": 0.8767792470489679, "learning_rate": 5e-06, "loss": 0.6295, "step": 60 }, { "epoch": 0.3114571746384872, "grad_norm": 1.337143723222155, "learning_rate": 5e-06, "loss": 0.615, "step": 70 }, { "epoch": 0.3559510567296997, "grad_norm": 0.5940615588218296, "learning_rate": 5e-06, "loss": 0.6111, "step": 80 }, { "epoch": 0.40044493882091214, "grad_norm": 0.6245704159321135, "learning_rate": 5e-06, "loss": 0.6, "step": 90 }, { "epoch": 0.44493882091212456, "grad_norm": 0.5683316859611832, "learning_rate": 5e-06, "loss": 0.6087, "step": 100 }, { "epoch": 0.489432703003337, "grad_norm": 0.5159069244539627, "learning_rate": 5e-06, "loss": 0.5983, "step": 110 }, { "epoch": 0.5339265850945495, "grad_norm": 0.5072121076395835, "learning_rate": 5e-06, "loss": 0.6023, "step": 120 }, { "epoch": 0.578420467185762, "grad_norm": 0.5274934145200447, "learning_rate": 5e-06, "loss": 0.5919, "step": 130 }, { "epoch": 0.6229143492769744, "grad_norm": 0.5544559368876572, "learning_rate": 5e-06, "loss": 0.5974, "step": 140 }, { "epoch": 0.6674082313681868, "grad_norm": 0.4929894123721837, "learning_rate": 5e-06, "loss": 0.5945, "step": 150 }, { "epoch": 0.7119021134593994, "grad_norm": 1.1362385626723797, "learning_rate": 5e-06, "loss": 0.5892, "step": 160 }, { "epoch": 0.7563959955506118, "grad_norm": 0.4793988319315333, "learning_rate": 5e-06, "loss": 0.5856, "step": 170 }, { "epoch": 0.8008898776418243, "grad_norm": 0.5137409156397603, "learning_rate": 5e-06, "loss": 0.5879, "step": 180 }, { "epoch": 0.8453837597330367, "grad_norm": 0.7484432139988203, "learning_rate": 5e-06, "loss": 0.5897, "step": 190 }, { "epoch": 0.8898776418242491, "grad_norm": 0.4983308283425539, "learning_rate": 5e-06, "loss": 0.5754, "step": 200 }, { "epoch": 0.9343715239154616, "grad_norm": 0.5985618062120786, "learning_rate": 5e-06, "loss": 0.5821, "step": 210 }, { "epoch": 0.978865406006674, "grad_norm": 0.4660686039944073, "learning_rate": 5e-06, "loss": 0.581, "step": 220 }, { "epoch": 0.996662958843159, "eval_loss": 0.5854274034500122, "eval_runtime": 240.2197, "eval_samples_per_second": 25.21, "eval_steps_per_second": 0.395, "step": 224 }, { "epoch": 1.0239154616240267, "grad_norm": 0.7457465164497729, "learning_rate": 5e-06, "loss": 0.5763, "step": 230 }, { "epoch": 1.068409343715239, "grad_norm": 0.6326576381402257, "learning_rate": 5e-06, "loss": 0.5449, "step": 240 }, { "epoch": 1.1129032258064515, "grad_norm": 0.6388396264873892, "learning_rate": 5e-06, "loss": 0.5301, "step": 250 }, { "epoch": 1.1573971078976641, "grad_norm": 0.752126204770771, "learning_rate": 5e-06, "loss": 0.5374, "step": 260 }, { "epoch": 1.2018909899888766, "grad_norm": 0.4673289459919312, "learning_rate": 5e-06, "loss": 0.5314, "step": 270 }, { "epoch": 1.246384872080089, "grad_norm": 0.5707602520384042, "learning_rate": 5e-06, "loss": 0.5371, "step": 280 }, { "epoch": 1.2908787541713014, "grad_norm": 0.7449098291403021, "learning_rate": 5e-06, "loss": 0.5393, "step": 290 }, { "epoch": 1.3353726362625138, "grad_norm": 0.6185110765439527, "learning_rate": 5e-06, "loss": 0.5361, "step": 300 }, { "epoch": 1.3798665183537264, "grad_norm": 0.6947624284326104, "learning_rate": 5e-06, "loss": 0.5353, "step": 310 }, { "epoch": 1.4243604004449388, "grad_norm": 0.5200108023651202, "learning_rate": 5e-06, "loss": 0.5316, "step": 320 }, { "epoch": 1.4688542825361512, "grad_norm": 0.47510706811194214, "learning_rate": 5e-06, "loss": 0.5352, "step": 330 }, { "epoch": 1.5133481646273639, "grad_norm": 0.4867636105327538, "learning_rate": 5e-06, "loss": 0.5415, "step": 340 }, { "epoch": 1.557842046718576, "grad_norm": 0.48217592935887066, "learning_rate": 5e-06, "loss": 0.5339, "step": 350 }, { "epoch": 1.6023359288097887, "grad_norm": 0.4650078322874499, "learning_rate": 5e-06, "loss": 0.5295, "step": 360 }, { "epoch": 1.6468298109010011, "grad_norm": 0.570457650374032, "learning_rate": 5e-06, "loss": 0.5333, "step": 370 }, { "epoch": 1.6913236929922135, "grad_norm": 0.5230883279688195, "learning_rate": 5e-06, "loss": 0.5347, "step": 380 }, { "epoch": 1.7358175750834262, "grad_norm": 0.5808698181708927, "learning_rate": 5e-06, "loss": 0.5338, "step": 390 }, { "epoch": 1.7803114571746383, "grad_norm": 0.6131929934071662, "learning_rate": 5e-06, "loss": 0.5392, "step": 400 }, { "epoch": 1.824805339265851, "grad_norm": 0.6516997090789, "learning_rate": 5e-06, "loss": 0.52, "step": 410 }, { "epoch": 1.8692992213570634, "grad_norm": 0.5459884768353754, "learning_rate": 5e-06, "loss": 0.5306, "step": 420 }, { "epoch": 1.9137931034482758, "grad_norm": 0.5136522179463594, "learning_rate": 5e-06, "loss": 0.5369, "step": 430 }, { "epoch": 1.9582869855394884, "grad_norm": 0.5400184881508431, "learning_rate": 5e-06, "loss": 0.5298, "step": 440 }, { "epoch": 1.9983314794215796, "eval_loss": 0.5757958889007568, "eval_runtime": 242.08, "eval_samples_per_second": 25.017, "eval_steps_per_second": 0.392, "step": 449 }, { "epoch": 2.0033370411568407, "grad_norm": 0.7649398226033791, "learning_rate": 5e-06, "loss": 0.5412, "step": 450 }, { "epoch": 2.0478309232480534, "grad_norm": 0.5503149371785776, "learning_rate": 5e-06, "loss": 0.4782, "step": 460 }, { "epoch": 2.092324805339266, "grad_norm": 0.5497911599700889, "learning_rate": 5e-06, "loss": 0.4801, "step": 470 }, { "epoch": 2.136818687430478, "grad_norm": 0.5261406383405891, "learning_rate": 5e-06, "loss": 0.4753, "step": 480 }, { "epoch": 2.181312569521691, "grad_norm": 0.7163879690094836, "learning_rate": 5e-06, "loss": 0.4844, "step": 490 }, { "epoch": 2.225806451612903, "grad_norm": 0.5297217476527686, "learning_rate": 5e-06, "loss": 0.4822, "step": 500 }, { "epoch": 2.2703003337041157, "grad_norm": 0.5953010609328895, "learning_rate": 5e-06, "loss": 0.4899, "step": 510 }, { "epoch": 2.3147942157953283, "grad_norm": 0.5148912353492243, "learning_rate": 5e-06, "loss": 0.4939, "step": 520 }, { "epoch": 2.3592880978865405, "grad_norm": 0.6069152681341892, "learning_rate": 5e-06, "loss": 0.4835, "step": 530 }, { "epoch": 2.403781979977753, "grad_norm": 0.6540469956921977, "learning_rate": 5e-06, "loss": 0.4889, "step": 540 }, { "epoch": 2.4482758620689653, "grad_norm": 0.5359141705186573, "learning_rate": 5e-06, "loss": 0.4865, "step": 550 }, { "epoch": 2.492769744160178, "grad_norm": 0.532021339209882, "learning_rate": 5e-06, "loss": 0.483, "step": 560 }, { "epoch": 2.5372636262513906, "grad_norm": 0.5745397487010325, "learning_rate": 5e-06, "loss": 0.4797, "step": 570 }, { "epoch": 2.5817575083426028, "grad_norm": 0.5721099629533181, "learning_rate": 5e-06, "loss": 0.477, "step": 580 }, { "epoch": 2.6262513904338154, "grad_norm": 0.49094180466012677, "learning_rate": 5e-06, "loss": 0.4833, "step": 590 }, { "epoch": 2.6707452725250276, "grad_norm": 0.4909218814568728, "learning_rate": 5e-06, "loss": 0.4897, "step": 600 }, { "epoch": 2.71523915461624, "grad_norm": 0.5181636597841739, "learning_rate": 5e-06, "loss": 0.4893, "step": 610 }, { "epoch": 2.759733036707453, "grad_norm": 0.46959422158183145, "learning_rate": 5e-06, "loss": 0.4809, "step": 620 }, { "epoch": 2.804226918798665, "grad_norm": 0.5864283756615662, "learning_rate": 5e-06, "loss": 0.4888, "step": 630 }, { "epoch": 2.8487208008898777, "grad_norm": 0.4555937611441811, "learning_rate": 5e-06, "loss": 0.4844, "step": 640 }, { "epoch": 2.89321468298109, "grad_norm": 0.4648877683489992, "learning_rate": 5e-06, "loss": 0.4836, "step": 650 }, { "epoch": 2.9377085650723025, "grad_norm": 0.645091025204656, "learning_rate": 5e-06, "loss": 0.4916, "step": 660 }, { "epoch": 2.982202447163515, "grad_norm": 0.5297809492883717, "learning_rate": 5e-06, "loss": 0.483, "step": 670 }, { "epoch": 2.9911012235817576, "eval_loss": 0.5784014463424683, "eval_runtime": 240.5667, "eval_samples_per_second": 25.174, "eval_steps_per_second": 0.395, "step": 672 }, { "epoch": 2.9911012235817576, "step": 672, "total_flos": 1125415649280000.0, "train_loss": 0.5457742023503497, "train_runtime": 40203.7661, "train_samples_per_second": 8.586, "train_steps_per_second": 0.017 } ], "logging_steps": 10, "max_steps": 672, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1125415649280000.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }