| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 3.0, |
| "eval_steps": 500, |
| "global_step": 819, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.03663003663003663, |
| "grad_norm": 2.6755562756699156, |
| "learning_rate": 5e-06, |
| "loss": 1.035, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.07326007326007326, |
| "grad_norm": 5.71293803806716, |
| "learning_rate": 5e-06, |
| "loss": 0.9104, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.10989010989010989, |
| "grad_norm": 1.2085741029349113, |
| "learning_rate": 5e-06, |
| "loss": 0.8736, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.14652014652014653, |
| "grad_norm": 1.059768802396023, |
| "learning_rate": 5e-06, |
| "loss": 0.8444, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.18315018315018314, |
| "grad_norm": 1.2824869965245793, |
| "learning_rate": 5e-06, |
| "loss": 0.8236, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.21978021978021978, |
| "grad_norm": 1.0163565118914315, |
| "learning_rate": 5e-06, |
| "loss": 0.8101, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.2564102564102564, |
| "grad_norm": 0.8142514953905634, |
| "learning_rate": 5e-06, |
| "loss": 0.798, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.29304029304029305, |
| "grad_norm": 0.78627587936006, |
| "learning_rate": 5e-06, |
| "loss": 0.7823, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.32967032967032966, |
| "grad_norm": 1.219228058219057, |
| "learning_rate": 5e-06, |
| "loss": 0.7843, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.3663003663003663, |
| "grad_norm": 0.856017546736893, |
| "learning_rate": 5e-06, |
| "loss": 0.7814, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.40293040293040294, |
| "grad_norm": 0.8398588201896897, |
| "learning_rate": 5e-06, |
| "loss": 0.7689, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.43956043956043955, |
| "grad_norm": 0.6452439779107471, |
| "learning_rate": 5e-06, |
| "loss": 0.7629, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.47619047619047616, |
| "grad_norm": 0.7382887179450202, |
| "learning_rate": 5e-06, |
| "loss": 0.764, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.5128205128205128, |
| "grad_norm": 0.8641898972221077, |
| "learning_rate": 5e-06, |
| "loss": 0.7579, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.5494505494505495, |
| "grad_norm": 0.6968648193284981, |
| "learning_rate": 5e-06, |
| "loss": 0.7583, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.5860805860805861, |
| "grad_norm": 0.7876021013623588, |
| "learning_rate": 5e-06, |
| "loss": 0.7603, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.6227106227106227, |
| "grad_norm": 0.6628535279447354, |
| "learning_rate": 5e-06, |
| "loss": 0.7549, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.6593406593406593, |
| "grad_norm": 0.5808515472437231, |
| "learning_rate": 5e-06, |
| "loss": 0.7532, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.6959706959706959, |
| "grad_norm": 0.6897807179523675, |
| "learning_rate": 5e-06, |
| "loss": 0.7538, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.7326007326007326, |
| "grad_norm": 0.6799003012866335, |
| "learning_rate": 5e-06, |
| "loss": 0.7468, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.7692307692307693, |
| "grad_norm": 0.6218959361844152, |
| "learning_rate": 5e-06, |
| "loss": 0.7472, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.8058608058608059, |
| "grad_norm": 0.7883865245529573, |
| "learning_rate": 5e-06, |
| "loss": 0.7439, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.8424908424908425, |
| "grad_norm": 0.827056331164966, |
| "learning_rate": 5e-06, |
| "loss": 0.7387, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.8791208791208791, |
| "grad_norm": 0.8475964615734182, |
| "learning_rate": 5e-06, |
| "loss": 0.7457, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.9157509157509157, |
| "grad_norm": 0.6631462090502517, |
| "learning_rate": 5e-06, |
| "loss": 0.7432, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.9523809523809523, |
| "grad_norm": 0.7263929092757935, |
| "learning_rate": 5e-06, |
| "loss": 0.7394, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.989010989010989, |
| "grad_norm": 0.6555923863615204, |
| "learning_rate": 5e-06, |
| "loss": 0.7388, |
| "step": 270 |
| }, |
| { |
| "epoch": 1.0, |
| "eval_loss": 0.7403956651687622, |
| "eval_runtime": 27.4968, |
| "eval_samples_per_second": 266.758, |
| "eval_steps_per_second": 1.055, |
| "step": 273 |
| }, |
| { |
| "epoch": 1.0256410256410255, |
| "grad_norm": 0.7513303784704211, |
| "learning_rate": 5e-06, |
| "loss": 0.7059, |
| "step": 280 |
| }, |
| { |
| "epoch": 1.0622710622710623, |
| "grad_norm": 0.9557884526645677, |
| "learning_rate": 5e-06, |
| "loss": 0.6889, |
| "step": 290 |
| }, |
| { |
| "epoch": 1.098901098901099, |
| "grad_norm": 0.632082904582309, |
| "learning_rate": 5e-06, |
| "loss": 0.6931, |
| "step": 300 |
| }, |
| { |
| "epoch": 1.1355311355311355, |
| "grad_norm": 0.7054504488885992, |
| "learning_rate": 5e-06, |
| "loss": 0.692, |
| "step": 310 |
| }, |
| { |
| "epoch": 1.1721611721611722, |
| "grad_norm": 0.9218108981577618, |
| "learning_rate": 5e-06, |
| "loss": 0.6906, |
| "step": 320 |
| }, |
| { |
| "epoch": 1.2087912087912087, |
| "grad_norm": 0.6597799156184799, |
| "learning_rate": 5e-06, |
| "loss": 0.6875, |
| "step": 330 |
| }, |
| { |
| "epoch": 1.2454212454212454, |
| "grad_norm": 0.5920093744179588, |
| "learning_rate": 5e-06, |
| "loss": 0.6854, |
| "step": 340 |
| }, |
| { |
| "epoch": 1.282051282051282, |
| "grad_norm": 0.6066160644410351, |
| "learning_rate": 5e-06, |
| "loss": 0.6848, |
| "step": 350 |
| }, |
| { |
| "epoch": 1.3186813186813187, |
| "grad_norm": 0.7191350945649401, |
| "learning_rate": 5e-06, |
| "loss": 0.6894, |
| "step": 360 |
| }, |
| { |
| "epoch": 1.3553113553113554, |
| "grad_norm": 0.6853021132203265, |
| "learning_rate": 5e-06, |
| "loss": 0.6858, |
| "step": 370 |
| }, |
| { |
| "epoch": 1.3919413919413919, |
| "grad_norm": 0.6295828110796835, |
| "learning_rate": 5e-06, |
| "loss": 0.6906, |
| "step": 380 |
| }, |
| { |
| "epoch": 1.4285714285714286, |
| "grad_norm": 0.6225605880403292, |
| "learning_rate": 5e-06, |
| "loss": 0.6872, |
| "step": 390 |
| }, |
| { |
| "epoch": 1.4652014652014653, |
| "grad_norm": 0.5721545540006036, |
| "learning_rate": 5e-06, |
| "loss": 0.6844, |
| "step": 400 |
| }, |
| { |
| "epoch": 1.5018315018315018, |
| "grad_norm": 0.6495541464106162, |
| "learning_rate": 5e-06, |
| "loss": 0.687, |
| "step": 410 |
| }, |
| { |
| "epoch": 1.5384615384615383, |
| "grad_norm": 0.6180233995820079, |
| "learning_rate": 5e-06, |
| "loss": 0.6836, |
| "step": 420 |
| }, |
| { |
| "epoch": 1.575091575091575, |
| "grad_norm": 0.5924680089205238, |
| "learning_rate": 5e-06, |
| "loss": 0.6881, |
| "step": 430 |
| }, |
| { |
| "epoch": 1.6117216117216118, |
| "grad_norm": 0.7715824466204046, |
| "learning_rate": 5e-06, |
| "loss": 0.6925, |
| "step": 440 |
| }, |
| { |
| "epoch": 1.6483516483516483, |
| "grad_norm": 0.5635809156525354, |
| "learning_rate": 5e-06, |
| "loss": 0.6808, |
| "step": 450 |
| }, |
| { |
| "epoch": 1.684981684981685, |
| "grad_norm": 0.7211075324292373, |
| "learning_rate": 5e-06, |
| "loss": 0.6841, |
| "step": 460 |
| }, |
| { |
| "epoch": 1.7216117216117217, |
| "grad_norm": 0.7574566804669711, |
| "learning_rate": 5e-06, |
| "loss": 0.6871, |
| "step": 470 |
| }, |
| { |
| "epoch": 1.7582417582417582, |
| "grad_norm": 0.714666214669077, |
| "learning_rate": 5e-06, |
| "loss": 0.6837, |
| "step": 480 |
| }, |
| { |
| "epoch": 1.7948717948717947, |
| "grad_norm": 0.6715552536826968, |
| "learning_rate": 5e-06, |
| "loss": 0.6883, |
| "step": 490 |
| }, |
| { |
| "epoch": 1.8315018315018317, |
| "grad_norm": 0.8501456353850813, |
| "learning_rate": 5e-06, |
| "loss": 0.6824, |
| "step": 500 |
| }, |
| { |
| "epoch": 1.8681318681318682, |
| "grad_norm": 0.8210564757608708, |
| "learning_rate": 5e-06, |
| "loss": 0.6826, |
| "step": 510 |
| }, |
| { |
| "epoch": 1.9047619047619047, |
| "grad_norm": 0.6233607070658583, |
| "learning_rate": 5e-06, |
| "loss": 0.6823, |
| "step": 520 |
| }, |
| { |
| "epoch": 1.9413919413919414, |
| "grad_norm": 0.6807825420722514, |
| "learning_rate": 5e-06, |
| "loss": 0.6885, |
| "step": 530 |
| }, |
| { |
| "epoch": 1.978021978021978, |
| "grad_norm": 0.6629344076568074, |
| "learning_rate": 5e-06, |
| "loss": 0.6824, |
| "step": 540 |
| }, |
| { |
| "epoch": 2.0, |
| "eval_loss": 0.730824887752533, |
| "eval_runtime": 27.3193, |
| "eval_samples_per_second": 268.491, |
| "eval_steps_per_second": 1.062, |
| "step": 546 |
| }, |
| { |
| "epoch": 2.0146520146520146, |
| "grad_norm": 0.9129368210722825, |
| "learning_rate": 5e-06, |
| "loss": 0.6558, |
| "step": 550 |
| }, |
| { |
| "epoch": 2.051282051282051, |
| "grad_norm": 0.8503780606175464, |
| "learning_rate": 5e-06, |
| "loss": 0.6295, |
| "step": 560 |
| }, |
| { |
| "epoch": 2.087912087912088, |
| "grad_norm": 0.7964894282266023, |
| "learning_rate": 5e-06, |
| "loss": 0.6286, |
| "step": 570 |
| }, |
| { |
| "epoch": 2.1245421245421245, |
| "grad_norm": 0.7422264580205383, |
| "learning_rate": 5e-06, |
| "loss": 0.6314, |
| "step": 580 |
| }, |
| { |
| "epoch": 2.161172161172161, |
| "grad_norm": 0.8546952236590375, |
| "learning_rate": 5e-06, |
| "loss": 0.6283, |
| "step": 590 |
| }, |
| { |
| "epoch": 2.197802197802198, |
| "grad_norm": 0.5776994000174623, |
| "learning_rate": 5e-06, |
| "loss": 0.6314, |
| "step": 600 |
| }, |
| { |
| "epoch": 2.2344322344322345, |
| "grad_norm": 1.0037526840488868, |
| "learning_rate": 5e-06, |
| "loss": 0.6335, |
| "step": 610 |
| }, |
| { |
| "epoch": 2.271062271062271, |
| "grad_norm": 0.7062352774981094, |
| "learning_rate": 5e-06, |
| "loss": 0.6357, |
| "step": 620 |
| }, |
| { |
| "epoch": 2.3076923076923075, |
| "grad_norm": 0.8407157088448836, |
| "learning_rate": 5e-06, |
| "loss": 0.633, |
| "step": 630 |
| }, |
| { |
| "epoch": 2.3443223443223444, |
| "grad_norm": 0.5822705575557771, |
| "learning_rate": 5e-06, |
| "loss": 0.6349, |
| "step": 640 |
| }, |
| { |
| "epoch": 2.380952380952381, |
| "grad_norm": 0.7810470709804241, |
| "learning_rate": 5e-06, |
| "loss": 0.6358, |
| "step": 650 |
| }, |
| { |
| "epoch": 2.4175824175824174, |
| "grad_norm": 0.6792571477446749, |
| "learning_rate": 5e-06, |
| "loss": 0.634, |
| "step": 660 |
| }, |
| { |
| "epoch": 2.4542124542124544, |
| "grad_norm": 0.6582561005269292, |
| "learning_rate": 5e-06, |
| "loss": 0.6332, |
| "step": 670 |
| }, |
| { |
| "epoch": 2.490842490842491, |
| "grad_norm": 0.7333034032018606, |
| "learning_rate": 5e-06, |
| "loss": 0.636, |
| "step": 680 |
| }, |
| { |
| "epoch": 2.5274725274725274, |
| "grad_norm": 0.6514490686612251, |
| "learning_rate": 5e-06, |
| "loss": 0.6342, |
| "step": 690 |
| }, |
| { |
| "epoch": 2.564102564102564, |
| "grad_norm": 0.7738245201171186, |
| "learning_rate": 5e-06, |
| "loss": 0.6333, |
| "step": 700 |
| }, |
| { |
| "epoch": 2.600732600732601, |
| "grad_norm": 0.6223909134135305, |
| "learning_rate": 5e-06, |
| "loss": 0.6373, |
| "step": 710 |
| }, |
| { |
| "epoch": 2.6373626373626373, |
| "grad_norm": 0.6265203120796818, |
| "learning_rate": 5e-06, |
| "loss": 0.6364, |
| "step": 720 |
| }, |
| { |
| "epoch": 2.6739926739926743, |
| "grad_norm": 0.7274455241516222, |
| "learning_rate": 5e-06, |
| "loss": 0.6382, |
| "step": 730 |
| }, |
| { |
| "epoch": 2.7106227106227108, |
| "grad_norm": 0.7062825571332273, |
| "learning_rate": 5e-06, |
| "loss": 0.638, |
| "step": 740 |
| }, |
| { |
| "epoch": 2.7472527472527473, |
| "grad_norm": 0.7066746525423848, |
| "learning_rate": 5e-06, |
| "loss": 0.6349, |
| "step": 750 |
| }, |
| { |
| "epoch": 2.7838827838827838, |
| "grad_norm": 0.6012743699495499, |
| "learning_rate": 5e-06, |
| "loss": 0.6379, |
| "step": 760 |
| }, |
| { |
| "epoch": 2.8205128205128203, |
| "grad_norm": 0.6253902136875567, |
| "learning_rate": 5e-06, |
| "loss": 0.6337, |
| "step": 770 |
| }, |
| { |
| "epoch": 2.857142857142857, |
| "grad_norm": 0.799587316373445, |
| "learning_rate": 5e-06, |
| "loss": 0.641, |
| "step": 780 |
| }, |
| { |
| "epoch": 2.8937728937728937, |
| "grad_norm": 0.6530718377415161, |
| "learning_rate": 5e-06, |
| "loss": 0.6388, |
| "step": 790 |
| }, |
| { |
| "epoch": 2.9304029304029307, |
| "grad_norm": 0.6532704811463808, |
| "learning_rate": 5e-06, |
| "loss": 0.6369, |
| "step": 800 |
| }, |
| { |
| "epoch": 2.967032967032967, |
| "grad_norm": 0.6925982583545915, |
| "learning_rate": 5e-06, |
| "loss": 0.6421, |
| "step": 810 |
| }, |
| { |
| "epoch": 3.0, |
| "eval_loss": 0.7364696264266968, |
| "eval_runtime": 26.2679, |
| "eval_samples_per_second": 279.238, |
| "eval_steps_per_second": 1.104, |
| "step": 819 |
| }, |
| { |
| "epoch": 3.0, |
| "step": 819, |
| "total_flos": 1371855504015360.0, |
| "train_loss": 0.7025235231979426, |
| "train_runtime": 5539.9882, |
| "train_samples_per_second": 75.468, |
| "train_steps_per_second": 0.148 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 819, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1371855504015360.0, |
| "train_batch_size": 16, |
| "trial_name": null, |
| "trial_params": null |
| } |