{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.9911012235817576,
  "eval_steps": 500,
  "global_step": 672,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.04449388209121246,
      "grad_norm": 17.919046764867105,
      "learning_rate": 5e-06,
      "loss": 0.7727,
      "step": 10
    },
    {
      "epoch": 0.08898776418242492,
      "grad_norm": 2.5068176722694706,
      "learning_rate": 5e-06,
      "loss": 0.7067,
      "step": 20
    },
    {
      "epoch": 0.13348164627363737,
      "grad_norm": 0.8188606020387743,
      "learning_rate": 5e-06,
      "loss": 0.6732,
      "step": 30
    },
    {
      "epoch": 0.17797552836484984,
      "grad_norm": 0.7944470655707544,
      "learning_rate": 5e-06,
      "loss": 0.6514,
      "step": 40
    },
    {
      "epoch": 0.22246941045606228,
      "grad_norm": 0.9562761756923293,
      "learning_rate": 5e-06,
      "loss": 0.6433,
      "step": 50
    },
    {
      "epoch": 0.26696329254727474,
      "grad_norm": 0.8767792470489679,
      "learning_rate": 5e-06,
      "loss": 0.6295,
      "step": 60
    },
    {
      "epoch": 0.3114571746384872,
      "grad_norm": 1.337143723222155,
      "learning_rate": 5e-06,
      "loss": 0.615,
      "step": 70
    },
    {
      "epoch": 0.3559510567296997,
      "grad_norm": 0.5940615588218296,
      "learning_rate": 5e-06,
      "loss": 0.6111,
      "step": 80
    },
    {
      "epoch": 0.40044493882091214,
      "grad_norm": 0.6245704159321135,
      "learning_rate": 5e-06,
      "loss": 0.6,
      "step": 90
    },
    {
      "epoch": 0.44493882091212456,
      "grad_norm": 0.5683316859611832,
      "learning_rate": 5e-06,
      "loss": 0.6087,
      "step": 100
    },
    {
      "epoch": 0.489432703003337,
      "grad_norm": 0.5159069244539627,
      "learning_rate": 5e-06,
      "loss": 0.5983,
      "step": 110
    },
    {
      "epoch": 0.5339265850945495,
      "grad_norm": 0.5072121076395835,
      "learning_rate": 5e-06,
      "loss": 0.6023,
      "step": 120
    },
    {
      "epoch": 0.578420467185762,
      "grad_norm": 0.5274934145200447,
      "learning_rate": 5e-06,
      "loss": 0.5919,
      "step": 130
    },
    {
      "epoch": 0.6229143492769744,
      "grad_norm": 0.5544559368876572,
      "learning_rate": 5e-06,
      "loss": 0.5974,
      "step": 140
    },
    {
      "epoch": 0.6674082313681868,
      "grad_norm": 0.4929894123721837,
      "learning_rate": 5e-06,
      "loss": 0.5945,
      "step": 150
    },
    {
      "epoch": 0.7119021134593994,
      "grad_norm": 1.1362385626723797,
      "learning_rate": 5e-06,
      "loss": 0.5892,
      "step": 160
    },
    {
      "epoch": 0.7563959955506118,
      "grad_norm": 0.4793988319315333,
      "learning_rate": 5e-06,
      "loss": 0.5856,
      "step": 170
    },
    {
      "epoch": 0.8008898776418243,
      "grad_norm": 0.5137409156397603,
      "learning_rate": 5e-06,
      "loss": 0.5879,
      "step": 180
    },
    {
      "epoch": 0.8453837597330367,
      "grad_norm": 0.7484432139988203,
      "learning_rate": 5e-06,
      "loss": 0.5897,
      "step": 190
    },
    {
      "epoch": 0.8898776418242491,
      "grad_norm": 0.4983308283425539,
      "learning_rate": 5e-06,
      "loss": 0.5754,
      "step": 200
    },
    {
      "epoch": 0.9343715239154616,
      "grad_norm": 0.5985618062120786,
      "learning_rate": 5e-06,
      "loss": 0.5821,
      "step": 210
    },
    {
      "epoch": 0.978865406006674,
      "grad_norm": 0.4660686039944073,
      "learning_rate": 5e-06,
      "loss": 0.581,
      "step": 220
    },
    {
      "epoch": 0.996662958843159,
      "eval_loss": 0.5854274034500122,
      "eval_runtime": 240.2197,
      "eval_samples_per_second": 25.21,
      "eval_steps_per_second": 0.395,
      "step": 224
    },
    {
      "epoch": 1.0239154616240267,
      "grad_norm": 0.7457465164497729,
      "learning_rate": 5e-06,
      "loss": 0.5763,
      "step": 230
    },
    {
      "epoch": 1.068409343715239,
      "grad_norm": 0.6326576381402257,
      "learning_rate": 5e-06,
      "loss": 0.5449,
      "step": 240
    },
    {
      "epoch": 1.1129032258064515,
      "grad_norm": 0.6388396264873892,
      "learning_rate": 5e-06,
      "loss": 0.5301,
      "step": 250
    },
    {
      "epoch": 1.1573971078976641,
      "grad_norm": 0.752126204770771,
      "learning_rate": 5e-06,
      "loss": 0.5374,
      "step": 260
    },
    {
      "epoch": 1.2018909899888766,
      "grad_norm": 0.4673289459919312,
      "learning_rate": 5e-06,
      "loss": 0.5314,
      "step": 270
    },
    {
      "epoch": 1.246384872080089,
      "grad_norm": 0.5707602520384042,
      "learning_rate": 5e-06,
      "loss": 0.5371,
      "step": 280
    },
    {
      "epoch": 1.2908787541713014,
      "grad_norm": 0.7449098291403021,
      "learning_rate": 5e-06,
      "loss": 0.5393,
      "step": 290
    },
    {
      "epoch": 1.3353726362625138,
      "grad_norm": 0.6185110765439527,
      "learning_rate": 5e-06,
      "loss": 0.5361,
      "step": 300
    },
    {
      "epoch": 1.3798665183537264,
      "grad_norm": 0.6947624284326104,
      "learning_rate": 5e-06,
      "loss": 0.5353,
      "step": 310
    },
    {
      "epoch": 1.4243604004449388,
      "grad_norm": 0.5200108023651202,
      "learning_rate": 5e-06,
      "loss": 0.5316,
      "step": 320
    },
    {
      "epoch": 1.4688542825361512,
      "grad_norm": 0.47510706811194214,
      "learning_rate": 5e-06,
      "loss": 0.5352,
      "step": 330
    },
    {
      "epoch": 1.5133481646273639,
      "grad_norm": 0.4867636105327538,
      "learning_rate": 5e-06,
      "loss": 0.5415,
      "step": 340
    },
    {
      "epoch": 1.557842046718576,
      "grad_norm": 0.48217592935887066,
      "learning_rate": 5e-06,
      "loss": 0.5339,
      "step": 350
    },
    {
      "epoch": 1.6023359288097887,
      "grad_norm": 0.4650078322874499,
      "learning_rate": 5e-06,
      "loss": 0.5295,
      "step": 360
    },
    {
      "epoch": 1.6468298109010011,
      "grad_norm": 0.570457650374032,
      "learning_rate": 5e-06,
      "loss": 0.5333,
      "step": 370
    },
    {
      "epoch": 1.6913236929922135,
      "grad_norm": 0.5230883279688195,
      "learning_rate": 5e-06,
      "loss": 0.5347,
      "step": 380
    },
    {
      "epoch": 1.7358175750834262,
      "grad_norm": 0.5808698181708927,
      "learning_rate": 5e-06,
      "loss": 0.5338,
      "step": 390
    },
    {
      "epoch": 1.7803114571746383,
      "grad_norm": 0.6131929934071662,
      "learning_rate": 5e-06,
      "loss": 0.5392,
      "step": 400
    },
    {
      "epoch": 1.824805339265851,
      "grad_norm": 0.6516997090789,
      "learning_rate": 5e-06,
      "loss": 0.52,
      "step": 410
    },
    {
      "epoch": 1.8692992213570634,
      "grad_norm": 0.5459884768353754,
      "learning_rate": 5e-06,
      "loss": 0.5306,
      "step": 420
    },
    {
      "epoch": 1.9137931034482758,
      "grad_norm": 0.5136522179463594,
      "learning_rate": 5e-06,
      "loss": 0.5369,
      "step": 430
    },
    {
      "epoch": 1.9582869855394884,
      "grad_norm": 0.5400184881508431,
      "learning_rate": 5e-06,
      "loss": 0.5298,
      "step": 440
    },
    {
      "epoch": 1.9983314794215796,
      "eval_loss": 0.5757958889007568,
      "eval_runtime": 242.08,
      "eval_samples_per_second": 25.017,
      "eval_steps_per_second": 0.392,
      "step": 449
    },
    {
      "epoch": 2.0033370411568407,
      "grad_norm": 0.7649398226033791,
      "learning_rate": 5e-06,
      "loss": 0.5412,
      "step": 450
    },
    {
      "epoch": 2.0478309232480534,
      "grad_norm": 0.5503149371785776,
      "learning_rate": 5e-06,
      "loss": 0.4782,
      "step": 460
    },
    {
      "epoch": 2.092324805339266,
      "grad_norm": 0.5497911599700889,
      "learning_rate": 5e-06,
      "loss": 0.4801,
      "step": 470
    },
    {
      "epoch": 2.136818687430478,
      "grad_norm": 0.5261406383405891,
      "learning_rate": 5e-06,
      "loss": 0.4753,
      "step": 480
    },
    {
      "epoch": 2.181312569521691,
      "grad_norm": 0.7163879690094836,
      "learning_rate": 5e-06,
      "loss": 0.4844,
      "step": 490
    },
    {
      "epoch": 2.225806451612903,
      "grad_norm": 0.5297217476527686,
      "learning_rate": 5e-06,
      "loss": 0.4822,
      "step": 500
    },
    {
      "epoch": 2.2703003337041157,
      "grad_norm": 0.5953010609328895,
      "learning_rate": 5e-06,
      "loss": 0.4899,
      "step": 510
    },
    {
      "epoch": 2.3147942157953283,
      "grad_norm": 0.5148912353492243,
      "learning_rate": 5e-06,
      "loss": 0.4939,
      "step": 520
    },
    {
      "epoch": 2.3592880978865405,
      "grad_norm": 0.6069152681341892,
      "learning_rate": 5e-06,
      "loss": 0.4835,
      "step": 530
    },
    {
      "epoch": 2.403781979977753,
      "grad_norm": 0.6540469956921977,
      "learning_rate": 5e-06,
      "loss": 0.4889,
      "step": 540
    },
    {
      "epoch": 2.4482758620689653,
      "grad_norm": 0.5359141705186573,
      "learning_rate": 5e-06,
      "loss": 0.4865,
      "step": 550
    },
    {
      "epoch": 2.492769744160178,
      "grad_norm": 0.532021339209882,
      "learning_rate": 5e-06,
      "loss": 0.483,
      "step": 560
    },
    {
      "epoch": 2.5372636262513906,
      "grad_norm": 0.5745397487010325,
      "learning_rate": 5e-06,
      "loss": 0.4797,
      "step": 570
    },
    {
      "epoch": 2.5817575083426028,
      "grad_norm": 0.5721099629533181,
      "learning_rate": 5e-06,
      "loss": 0.477,
      "step": 580
    },
    {
      "epoch": 2.6262513904338154,
      "grad_norm": 0.49094180466012677,
      "learning_rate": 5e-06,
      "loss": 0.4833,
      "step": 590
    },
    {
      "epoch": 2.6707452725250276,
      "grad_norm": 0.4909218814568728,
      "learning_rate": 5e-06,
      "loss": 0.4897,
      "step": 600
    },
    {
      "epoch": 2.71523915461624,
      "grad_norm": 0.5181636597841739,
      "learning_rate": 5e-06,
      "loss": 0.4893,
      "step": 610
    },
    {
      "epoch": 2.759733036707453,
      "grad_norm": 0.46959422158183145,
      "learning_rate": 5e-06,
      "loss": 0.4809,
      "step": 620
    },
    {
      "epoch": 2.804226918798665,
      "grad_norm": 0.5864283756615662,
      "learning_rate": 5e-06,
      "loss": 0.4888,
      "step": 630
    },
    {
      "epoch": 2.8487208008898777,
      "grad_norm": 0.4555937611441811,
      "learning_rate": 5e-06,
      "loss": 0.4844,
      "step": 640
    },
    {
      "epoch": 2.89321468298109,
      "grad_norm": 0.4648877683489992,
      "learning_rate": 5e-06,
      "loss": 0.4836,
      "step": 650
    },
    {
      "epoch": 2.9377085650723025,
      "grad_norm": 0.645091025204656,
      "learning_rate": 5e-06,
      "loss": 0.4916,
      "step": 660
    },
    {
      "epoch": 2.982202447163515,
      "grad_norm": 0.5297809492883717,
      "learning_rate": 5e-06,
      "loss": 0.483,
      "step": 670
    },
    {
      "epoch": 2.9911012235817576,
      "eval_loss": 0.5784014463424683,
      "eval_runtime": 240.5667,
      "eval_samples_per_second": 25.174,
      "eval_steps_per_second": 0.395,
      "step": 672
    },
    {
      "epoch": 2.9911012235817576,
      "step": 672,
      "total_flos": 1125415649280000.0,
      "train_loss": 0.5457742023503497,
      "train_runtime": 40203.7661,
      "train_samples_per_second": 8.586,
      "train_steps_per_second": 0.017
    }
  ],
  "logging_steps": 10,
  "max_steps": 672,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1125415649280000.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}