{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.99775617053104,
  "eval_steps": 100,
  "global_step": 501,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.05983545250560957,
      "grad_norm": 7.40893212326487,
      "learning_rate": 1.96078431372549e-06,
      "loss": 0.6407,
      "step": 10
    },
    {
      "epoch": 0.11967090501121914,
      "grad_norm": 2.443879636951024,
      "learning_rate": 3.92156862745098e-06,
      "loss": 0.3887,
      "step": 20
    },
    {
      "epoch": 0.17950635751682872,
      "grad_norm": 2.182405695242156,
      "learning_rate": 5.882352941176471e-06,
      "loss": 0.2616,
      "step": 30
    },
    {
      "epoch": 0.2393418100224383,
      "grad_norm": 2.01489573168189,
      "learning_rate": 7.84313725490196e-06,
      "loss": 0.2571,
      "step": 40
    },
    {
      "epoch": 0.2991772625280479,
      "grad_norm": 1.979013183070111,
      "learning_rate": 9.803921568627451e-06,
      "loss": 0.2477,
      "step": 50
    },
    {
      "epoch": 0.35901271503365745,
      "grad_norm": 1.549355728093093,
      "learning_rate": 9.990133642141359e-06,
      "loss": 0.2176,
      "step": 60
    },
    {
      "epoch": 0.418848167539267,
      "grad_norm": 1.4426481204872057,
      "learning_rate": 9.95607770125771e-06,
      "loss": 0.2273,
      "step": 70
    },
    {
      "epoch": 0.4786836200448766,
      "grad_norm": 1.7789832633152836,
      "learning_rate": 9.89787624799672e-06,
      "loss": 0.2125,
      "step": 80
    },
    {
      "epoch": 0.5385190725504861,
      "grad_norm": 1.529318217095282,
      "learning_rate": 9.815812833988292e-06,
      "loss": 0.2229,
      "step": 90
    },
    {
      "epoch": 0.5983545250560958,
      "grad_norm": 1.6259265112226373,
      "learning_rate": 9.710287263936485e-06,
      "loss": 0.2062,
      "step": 100
    },
    {
      "epoch": 0.5983545250560958,
      "eval_loss": 0.2123425155878067,
      "eval_runtime": 33.4729,
      "eval_samples_per_second": 17.776,
      "eval_steps_per_second": 8.903,
      "step": 100
    },
    {
      "epoch": 0.6581899775617053,
      "grad_norm": 1.6245243576341002,
      "learning_rate": 9.581813647811199e-06,
      "loss": 0.2105,
      "step": 110
    },
    {
      "epoch": 0.7180254300673149,
      "grad_norm": 1.731561075586601,
      "learning_rate": 9.431017896156074e-06,
      "loss": 0.2048,
      "step": 120
    },
    {
      "epoch": 0.7778608825729244,
      "grad_norm": 1.7874480467541498,
      "learning_rate": 9.25863467071524e-06,
      "loss": 0.2113,
      "step": 130
    },
    {
      "epoch": 0.837696335078534,
      "grad_norm": 1.3708463663991368,
      "learning_rate": 9.065503805235139e-06,
      "loss": 0.1988,
      "step": 140
    },
    {
      "epoch": 0.8975317875841436,
      "grad_norm": 1.3567660521800535,
      "learning_rate": 8.852566213878947e-06,
      "loss": 0.2038,
      "step": 150
    },
    {
      "epoch": 0.9573672400897532,
      "grad_norm": 1.8281708498422444,
      "learning_rate": 8.620859307187339e-06,
      "loss": 0.2196,
      "step": 160
    },
    {
      "epoch": 1.0172026925953628,
      "grad_norm": 1.2318054900550177,
      "learning_rate": 8.371511937918616e-06,
      "loss": 0.1762,
      "step": 170
    },
    {
      "epoch": 1.0770381451009723,
      "grad_norm": 1.568321912319435,
      "learning_rate": 8.105738901391553e-06,
      "loss": 0.1288,
      "step": 180
    },
    {
      "epoch": 1.136873597606582,
      "grad_norm": 1.3819346363939895,
      "learning_rate": 7.82483501712469e-06,
      "loss": 0.1214,
      "step": 190
    },
    {
      "epoch": 1.1967090501121915,
      "grad_norm": 1.2680685647450163,
      "learning_rate": 7.530168820605819e-06,
      "loss": 0.1256,
      "step": 200
    },
    {
      "epoch": 1.1967090501121915,
      "eval_loss": 0.20358169078826904,
      "eval_runtime": 32.7594,
      "eval_samples_per_second": 18.163,
      "eval_steps_per_second": 9.097,
      "step": 200
    },
    {
      "epoch": 1.256544502617801,
      "grad_norm": 1.2942802177914767,
      "learning_rate": 7.223175895924638e-06,
      "loss": 0.1241,
      "step": 210
    },
    {
      "epoch": 1.3163799551234106,
      "grad_norm": 1.4364370392498633,
      "learning_rate": 6.905351881751372e-06,
      "loss": 0.1254,
      "step": 220
    },
    {
      "epoch": 1.37621540762902,
      "grad_norm": 1.330811194933078,
      "learning_rate": 6.578245184735513e-06,
      "loss": 0.1229,
      "step": 230
    },
    {
      "epoch": 1.4360508601346298,
      "grad_norm": 1.304831888309303,
      "learning_rate": 6.243449435824276e-06,
      "loss": 0.1147,
      "step": 240
    },
    {
      "epoch": 1.4958863126402393,
      "grad_norm": 1.2398683599838292,
      "learning_rate": 5.902595726252801e-06,
      "loss": 0.1345,
      "step": 250
    },
    {
      "epoch": 1.555721765145849,
      "grad_norm": 1.3240317320353998,
      "learning_rate": 5.557344661031628e-06,
      "loss": 0.1236,
      "step": 260
    },
    {
      "epoch": 1.6155572176514585,
      "grad_norm": 1.518581095835922,
      "learning_rate": 5.209378268645998e-06,
      "loss": 0.1218,
      "step": 270
    },
    {
      "epoch": 1.675392670157068,
      "grad_norm": 1.5653129689570715,
      "learning_rate": 4.860391806382157e-06,
      "loss": 0.1246,
      "step": 280
    },
    {
      "epoch": 1.7352281226626776,
      "grad_norm": 1.4836280079781416,
      "learning_rate": 4.512085501204254e-06,
      "loss": 0.1156,
      "step": 290
    },
    {
      "epoch": 1.795063575168287,
      "grad_norm": 1.4998045733125407,
      "learning_rate": 4.166156266419489e-06,
      "loss": 0.1296,
      "step": 300
    },
    {
      "epoch": 1.795063575168287,
      "eval_loss": 0.19370371103286743,
      "eval_runtime": 33.117,
      "eval_samples_per_second": 17.967,
      "eval_steps_per_second": 8.998,
      "step": 300
    },
    {
      "epoch": 1.8548990276738968,
      "grad_norm": 1.426847361331521,
      "learning_rate": 3.82428943448705e-06,
      "loss": 0.1294,
      "step": 310
    },
    {
      "epoch": 1.9147344801795063,
      "grad_norm": 1.1812939999353123,
      "learning_rate": 3.488150546247778e-06,
      "loss": 0.1219,
      "step": 320
    },
    {
      "epoch": 1.974569932685116,
      "grad_norm": 1.071812010046448,
      "learning_rate": 3.1593772365766107e-06,
      "loss": 0.1106,
      "step": 330
    },
    {
      "epoch": 2.0344053851907256,
      "grad_norm": 0.9913151474800547,
      "learning_rate": 2.839571255990088e-06,
      "loss": 0.0851,
      "step": 340
    },
    {
      "epoch": 2.094240837696335,
      "grad_norm": 1.0937548000001698,
      "learning_rate": 2.5302906670788463e-06,
      "loss": 0.0621,
      "step": 350
    },
    {
      "epoch": 2.1540762902019446,
      "grad_norm": 1.1416547973943143,
      "learning_rate": 2.23304225378328e-06,
      "loss": 0.0662,
      "step": 360
    },
    {
      "epoch": 2.213911742707554,
      "grad_norm": 1.2971227147360092,
      "learning_rate": 1.9492741804936623e-06,
      "loss": 0.0623,
      "step": 370
    },
    {
      "epoch": 2.273747195213164,
      "grad_norm": 1.0599796926376819,
      "learning_rate": 1.680368936738792e-06,
      "loss": 0.0604,
      "step": 380
    },
    {
      "epoch": 2.3335826477187736,
      "grad_norm": 1.0516418140346255,
      "learning_rate": 1.4276366018359845e-06,
      "loss": 0.0605,
      "step": 390
    },
    {
      "epoch": 2.393418100224383,
      "grad_norm": 1.1322674065288456,
      "learning_rate": 1.1923084623163172e-06,
      "loss": 0.0592,
      "step": 400
    },
    {
      "epoch": 2.393418100224383,
      "eval_loss": 0.21845205128192902,
      "eval_runtime": 33.1015,
      "eval_samples_per_second": 17.975,
      "eval_steps_per_second": 9.003,
      "step": 400
    },
    {
      "epoch": 2.4532535527299926,
      "grad_norm": 1.070168334944103,
      "learning_rate": 9.7553101322043e-07,
      "loss": 0.0595,
      "step": 410
    },
    {
      "epoch": 2.513089005235602,
      "grad_norm": 1.1779521251420957,
      "learning_rate": 7.783603724899258e-07,
      "loss": 0.0593,
      "step": 420
    },
    {
      "epoch": 2.5729244577412116,
      "grad_norm": 1.0392248842745917,
      "learning_rate": 6.017571356669183e-07,
      "loss": 0.0588,
      "step": 430
    },
    {
      "epoch": 2.632759910246821,
      "grad_norm": 1.1454410326378197,
      "learning_rate": 4.4658169596911493e-07,
      "loss": 0.0599,
      "step": 440
    },
    {
      "epoch": 2.6925953627524306,
      "grad_norm": 1.2414977662809759,
      "learning_rate": 3.135900525405428e-07,
      "loss": 0.0596,
      "step": 450
    },
    {
      "epoch": 2.75243081525804,
      "grad_norm": 0.9458175216720401,
      "learning_rate": 2.0343012729971244e-07,
      "loss": 0.0561,
      "step": 460
    },
    {
      "epoch": 2.81226626776365,
      "grad_norm": 1.2677156882489358,
      "learning_rate": 1.166386083291604e-07,
      "loss": 0.0566,
      "step": 470
    },
    {
      "epoch": 2.8721017202692596,
      "grad_norm": 1.0894838908576727,
      "learning_rate": 5.363833518505834e-08,
      "loss": 0.0608,
      "step": 480
    },
    {
      "epoch": 2.931937172774869,
      "grad_norm": 0.8300865767642356,
      "learning_rate": 1.4736238865398766e-08,
      "loss": 0.0548,
      "step": 490
    },
    {
      "epoch": 2.9917726252804786,
      "grad_norm": 1.1224586519889344,
      "learning_rate": 1.2184647302626585e-10,
      "loss": 0.0646,
      "step": 500
    },
    {
      "epoch": 2.9917726252804786,
      "eval_loss": 0.21988336741924286,
      "eval_runtime": 33.0412,
      "eval_samples_per_second": 18.008,
      "eval_steps_per_second": 9.019,
      "step": 500
    },
    {
      "epoch": 2.99775617053104,
      "step": 501,
      "total_flos": 9861900926976.0,
      "train_loss": 0.14637824013353345,
      "train_runtime": 2877.8871,
      "train_samples_per_second": 5.575,
      "train_steps_per_second": 0.174
    }
  ],
  "logging_steps": 10,
  "max_steps": 501,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 9861900926976.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}