| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 2.992616899097621, |
| "eval_steps": 500, |
| "global_step": 912, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.03281378178835111, |
| "grad_norm": 7.841750232611161, |
| "learning_rate": 5e-06, |
| "loss": 0.8915, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.06562756357670221, |
| "grad_norm": 2.8254265829702723, |
| "learning_rate": 5e-06, |
| "loss": 0.7744, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.09844134536505332, |
| "grad_norm": 1.2137270648230942, |
| "learning_rate": 5e-06, |
| "loss": 0.7284, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.13125512715340443, |
| "grad_norm": 1.0119344856669166, |
| "learning_rate": 5e-06, |
| "loss": 0.7015, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.16406890894175555, |
| "grad_norm": 1.1590120587560695, |
| "learning_rate": 5e-06, |
| "loss": 0.6751, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.19688269073010664, |
| "grad_norm": 1.0688354918846648, |
| "learning_rate": 5e-06, |
| "loss": 0.6539, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.22969647251845776, |
| "grad_norm": 0.7962599254830892, |
| "learning_rate": 5e-06, |
| "loss": 0.6416, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.26251025430680885, |
| "grad_norm": 1.3016899088803453, |
| "learning_rate": 5e-06, |
| "loss": 0.6401, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.29532403609515995, |
| "grad_norm": 0.8330196899066211, |
| "learning_rate": 5e-06, |
| "loss": 0.618, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.3281378178835111, |
| "grad_norm": 0.667189033403046, |
| "learning_rate": 5e-06, |
| "loss": 0.6196, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.3609515996718622, |
| "grad_norm": 0.7289539260596677, |
| "learning_rate": 5e-06, |
| "loss": 0.6297, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.3937653814602133, |
| "grad_norm": 0.6409803463860579, |
| "learning_rate": 5e-06, |
| "loss": 0.6216, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.4265791632485644, |
| "grad_norm": 0.70735959118925, |
| "learning_rate": 5e-06, |
| "loss": 0.6092, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.4593929450369155, |
| "grad_norm": 0.8142813291097145, |
| "learning_rate": 5e-06, |
| "loss": 0.6024, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.4922067268252666, |
| "grad_norm": 0.6426092966393037, |
| "learning_rate": 5e-06, |
| "loss": 0.6028, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.5250205086136177, |
| "grad_norm": 0.9814251564680376, |
| "learning_rate": 5e-06, |
| "loss": 0.6042, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.5578342904019689, |
| "grad_norm": 0.601307551623306, |
| "learning_rate": 5e-06, |
| "loss": 0.6047, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.5906480721903199, |
| "grad_norm": 0.5895692038996282, |
| "learning_rate": 5e-06, |
| "loss": 0.5936, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.623461853978671, |
| "grad_norm": 0.5348923163508846, |
| "learning_rate": 5e-06, |
| "loss": 0.5906, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.6562756357670222, |
| "grad_norm": 0.6710759767714962, |
| "learning_rate": 5e-06, |
| "loss": 0.5863, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.6890894175553732, |
| "grad_norm": 0.5530954428148116, |
| "learning_rate": 5e-06, |
| "loss": 0.5972, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.7219031993437244, |
| "grad_norm": 0.6414930112608357, |
| "learning_rate": 5e-06, |
| "loss": 0.5888, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.7547169811320755, |
| "grad_norm": 0.730574957468606, |
| "learning_rate": 5e-06, |
| "loss": 0.5913, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.7875307629204266, |
| "grad_norm": 0.8034131557047212, |
| "learning_rate": 5e-06, |
| "loss": 0.5849, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.8203445447087777, |
| "grad_norm": 0.6772004609931918, |
| "learning_rate": 5e-06, |
| "loss": 0.5904, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.8531583264971287, |
| "grad_norm": 0.5963848396035898, |
| "learning_rate": 5e-06, |
| "loss": 0.5814, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.8859721082854799, |
| "grad_norm": 0.8533728487617155, |
| "learning_rate": 5e-06, |
| "loss": 0.5926, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.918785890073831, |
| "grad_norm": 0.6929371502886701, |
| "learning_rate": 5e-06, |
| "loss": 0.584, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.9515996718621821, |
| "grad_norm": 1.0646930858335608, |
| "learning_rate": 5e-06, |
| "loss": 0.583, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.9844134536505332, |
| "grad_norm": 0.657886516351812, |
| "learning_rate": 5e-06, |
| "loss": 0.5822, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.9975389663658737, |
| "eval_loss": 0.5811063051223755, |
| "eval_runtime": 164.1949, |
| "eval_samples_per_second": 49.989, |
| "eval_steps_per_second": 0.396, |
| "step": 304 |
| }, |
| { |
| "epoch": 1.0172272354388843, |
| "grad_norm": 0.9145633972949372, |
| "learning_rate": 5e-06, |
| "loss": 0.5615, |
| "step": 310 |
| }, |
| { |
| "epoch": 1.0500410172272354, |
| "grad_norm": 0.8330439338358776, |
| "learning_rate": 5e-06, |
| "loss": 0.5501, |
| "step": 320 |
| }, |
| { |
| "epoch": 1.0828547990155866, |
| "grad_norm": 0.9017808771885161, |
| "learning_rate": 5e-06, |
| "loss": 0.5397, |
| "step": 330 |
| }, |
| { |
| "epoch": 1.1156685808039377, |
| "grad_norm": 0.7658877957248926, |
| "learning_rate": 5e-06, |
| "loss": 0.543, |
| "step": 340 |
| }, |
| { |
| "epoch": 1.1484823625922886, |
| "grad_norm": 0.559631564116058, |
| "learning_rate": 5e-06, |
| "loss": 0.5402, |
| "step": 350 |
| }, |
| { |
| "epoch": 1.1812961443806398, |
| "grad_norm": 0.7555936047357931, |
| "learning_rate": 5e-06, |
| "loss": 0.5424, |
| "step": 360 |
| }, |
| { |
| "epoch": 1.214109926168991, |
| "grad_norm": 0.7751086823302814, |
| "learning_rate": 5e-06, |
| "loss": 0.5338, |
| "step": 370 |
| }, |
| { |
| "epoch": 1.246923707957342, |
| "grad_norm": 1.1576515759835613, |
| "learning_rate": 5e-06, |
| "loss": 0.5447, |
| "step": 380 |
| }, |
| { |
| "epoch": 1.2797374897456932, |
| "grad_norm": 0.637673440804281, |
| "learning_rate": 5e-06, |
| "loss": 0.5436, |
| "step": 390 |
| }, |
| { |
| "epoch": 1.3125512715340442, |
| "grad_norm": 0.5608427930549431, |
| "learning_rate": 5e-06, |
| "loss": 0.538, |
| "step": 400 |
| }, |
| { |
| "epoch": 1.3453650533223955, |
| "grad_norm": 0.6236715534816973, |
| "learning_rate": 5e-06, |
| "loss": 0.5401, |
| "step": 410 |
| }, |
| { |
| "epoch": 1.3781788351107465, |
| "grad_norm": 0.6834154815439046, |
| "learning_rate": 5e-06, |
| "loss": 0.5371, |
| "step": 420 |
| }, |
| { |
| "epoch": 1.4109926168990976, |
| "grad_norm": 0.6230411253152371, |
| "learning_rate": 5e-06, |
| "loss": 0.5396, |
| "step": 430 |
| }, |
| { |
| "epoch": 1.4438063986874488, |
| "grad_norm": 0.5735284718821723, |
| "learning_rate": 5e-06, |
| "loss": 0.546, |
| "step": 440 |
| }, |
| { |
| "epoch": 1.4766201804758, |
| "grad_norm": 0.6777341683980052, |
| "learning_rate": 5e-06, |
| "loss": 0.5312, |
| "step": 450 |
| }, |
| { |
| "epoch": 1.509433962264151, |
| "grad_norm": 0.6145373481418245, |
| "learning_rate": 5e-06, |
| "loss": 0.5363, |
| "step": 460 |
| }, |
| { |
| "epoch": 1.542247744052502, |
| "grad_norm": 0.6691485262612896, |
| "learning_rate": 5e-06, |
| "loss": 0.5405, |
| "step": 470 |
| }, |
| { |
| "epoch": 1.5750615258408531, |
| "grad_norm": 0.6278473705145111, |
| "learning_rate": 5e-06, |
| "loss": 0.5336, |
| "step": 480 |
| }, |
| { |
| "epoch": 1.6078753076292043, |
| "grad_norm": 0.7195303747825454, |
| "learning_rate": 5e-06, |
| "loss": 0.5456, |
| "step": 490 |
| }, |
| { |
| "epoch": 1.6406890894175554, |
| "grad_norm": 0.6993265245998453, |
| "learning_rate": 5e-06, |
| "loss": 0.526, |
| "step": 500 |
| }, |
| { |
| "epoch": 1.6735028712059066, |
| "grad_norm": 0.5569198466336033, |
| "learning_rate": 5e-06, |
| "loss": 0.5437, |
| "step": 510 |
| }, |
| { |
| "epoch": 1.7063166529942575, |
| "grad_norm": 0.5962729727283363, |
| "learning_rate": 5e-06, |
| "loss": 0.54, |
| "step": 520 |
| }, |
| { |
| "epoch": 1.7391304347826086, |
| "grad_norm": 0.5528562495704502, |
| "learning_rate": 5e-06, |
| "loss": 0.5348, |
| "step": 530 |
| }, |
| { |
| "epoch": 1.7719442165709598, |
| "grad_norm": 0.5117801135768556, |
| "learning_rate": 5e-06, |
| "loss": 0.5354, |
| "step": 540 |
| }, |
| { |
| "epoch": 1.804757998359311, |
| "grad_norm": 0.6270359146501516, |
| "learning_rate": 5e-06, |
| "loss": 0.5405, |
| "step": 550 |
| }, |
| { |
| "epoch": 1.837571780147662, |
| "grad_norm": 0.49991108374112964, |
| "learning_rate": 5e-06, |
| "loss": 0.5311, |
| "step": 560 |
| }, |
| { |
| "epoch": 1.870385561936013, |
| "grad_norm": 0.8786052238787824, |
| "learning_rate": 5e-06, |
| "loss": 0.5276, |
| "step": 570 |
| }, |
| { |
| "epoch": 1.9031993437243644, |
| "grad_norm": 0.5526276905535635, |
| "learning_rate": 5e-06, |
| "loss": 0.5303, |
| "step": 580 |
| }, |
| { |
| "epoch": 1.9360131255127153, |
| "grad_norm": 0.6424736342148375, |
| "learning_rate": 5e-06, |
| "loss": 0.5378, |
| "step": 590 |
| }, |
| { |
| "epoch": 1.9688269073010665, |
| "grad_norm": 0.4985928762767277, |
| "learning_rate": 5e-06, |
| "loss": 0.5331, |
| "step": 600 |
| }, |
| { |
| "epoch": 1.9983593109105824, |
| "eval_loss": 0.5679268836975098, |
| "eval_runtime": 165.1141, |
| "eval_samples_per_second": 49.711, |
| "eval_steps_per_second": 0.394, |
| "step": 609 |
| }, |
| { |
| "epoch": 2.0016406890894176, |
| "grad_norm": 0.8535377767409578, |
| "learning_rate": 5e-06, |
| "loss": 0.5258, |
| "step": 610 |
| }, |
| { |
| "epoch": 2.0344544708777685, |
| "grad_norm": 0.8392464205497018, |
| "learning_rate": 5e-06, |
| "loss": 0.4982, |
| "step": 620 |
| }, |
| { |
| "epoch": 2.06726825266612, |
| "grad_norm": 0.6991396805774981, |
| "learning_rate": 5e-06, |
| "loss": 0.4856, |
| "step": 630 |
| }, |
| { |
| "epoch": 2.100082034454471, |
| "grad_norm": 0.6363700778523905, |
| "learning_rate": 5e-06, |
| "loss": 0.49, |
| "step": 640 |
| }, |
| { |
| "epoch": 2.132895816242822, |
| "grad_norm": 0.6689991208530507, |
| "learning_rate": 5e-06, |
| "loss": 0.49, |
| "step": 650 |
| }, |
| { |
| "epoch": 2.165709598031173, |
| "grad_norm": 0.5725274111655418, |
| "learning_rate": 5e-06, |
| "loss": 0.4926, |
| "step": 660 |
| }, |
| { |
| "epoch": 2.198523379819524, |
| "grad_norm": 0.8141370837443801, |
| "learning_rate": 5e-06, |
| "loss": 0.4876, |
| "step": 670 |
| }, |
| { |
| "epoch": 2.2313371616078754, |
| "grad_norm": 0.6282724145860844, |
| "learning_rate": 5e-06, |
| "loss": 0.4878, |
| "step": 680 |
| }, |
| { |
| "epoch": 2.2641509433962264, |
| "grad_norm": 0.6530073663418564, |
| "learning_rate": 5e-06, |
| "loss": 0.4944, |
| "step": 690 |
| }, |
| { |
| "epoch": 2.2969647251845773, |
| "grad_norm": 0.5699233164866262, |
| "learning_rate": 5e-06, |
| "loss": 0.4966, |
| "step": 700 |
| }, |
| { |
| "epoch": 2.3297785069729287, |
| "grad_norm": 0.610417793848035, |
| "learning_rate": 5e-06, |
| "loss": 0.49, |
| "step": 710 |
| }, |
| { |
| "epoch": 2.3625922887612796, |
| "grad_norm": 0.6289268935164573, |
| "learning_rate": 5e-06, |
| "loss": 0.5036, |
| "step": 720 |
| }, |
| { |
| "epoch": 2.395406070549631, |
| "grad_norm": 0.6537483086050372, |
| "learning_rate": 5e-06, |
| "loss": 0.4927, |
| "step": 730 |
| }, |
| { |
| "epoch": 2.428219852337982, |
| "grad_norm": 0.6648368643526823, |
| "learning_rate": 5e-06, |
| "loss": 0.4983, |
| "step": 740 |
| }, |
| { |
| "epoch": 2.4610336341263332, |
| "grad_norm": 0.6722645125843071, |
| "learning_rate": 5e-06, |
| "loss": 0.4949, |
| "step": 750 |
| }, |
| { |
| "epoch": 2.493847415914684, |
| "grad_norm": 0.5717762614134606, |
| "learning_rate": 5e-06, |
| "loss": 0.5014, |
| "step": 760 |
| }, |
| { |
| "epoch": 2.526661197703035, |
| "grad_norm": 0.6732475102422392, |
| "learning_rate": 5e-06, |
| "loss": 0.4978, |
| "step": 770 |
| }, |
| { |
| "epoch": 2.5594749794913865, |
| "grad_norm": 0.6176484271038984, |
| "learning_rate": 5e-06, |
| "loss": 0.4969, |
| "step": 780 |
| }, |
| { |
| "epoch": 2.5922887612797374, |
| "grad_norm": 0.5794899161431503, |
| "learning_rate": 5e-06, |
| "loss": 0.497, |
| "step": 790 |
| }, |
| { |
| "epoch": 2.6251025430680883, |
| "grad_norm": 0.6254525725000455, |
| "learning_rate": 5e-06, |
| "loss": 0.4881, |
| "step": 800 |
| }, |
| { |
| "epoch": 2.6579163248564397, |
| "grad_norm": 0.4997430504095228, |
| "learning_rate": 5e-06, |
| "loss": 0.4986, |
| "step": 810 |
| }, |
| { |
| "epoch": 2.690730106644791, |
| "grad_norm": 0.6452212092676007, |
| "learning_rate": 5e-06, |
| "loss": 0.4937, |
| "step": 820 |
| }, |
| { |
| "epoch": 2.723543888433142, |
| "grad_norm": 0.6248001192436421, |
| "learning_rate": 5e-06, |
| "loss": 0.5017, |
| "step": 830 |
| }, |
| { |
| "epoch": 2.756357670221493, |
| "grad_norm": 0.5729926597313496, |
| "learning_rate": 5e-06, |
| "loss": 0.4997, |
| "step": 840 |
| }, |
| { |
| "epoch": 2.7891714520098443, |
| "grad_norm": 0.6322042992676079, |
| "learning_rate": 5e-06, |
| "loss": 0.4975, |
| "step": 850 |
| }, |
| { |
| "epoch": 2.821985233798195, |
| "grad_norm": 0.6001289917677007, |
| "learning_rate": 5e-06, |
| "loss": 0.4932, |
| "step": 860 |
| }, |
| { |
| "epoch": 2.854799015586546, |
| "grad_norm": 0.6372644243152534, |
| "learning_rate": 5e-06, |
| "loss": 0.5005, |
| "step": 870 |
| }, |
| { |
| "epoch": 2.8876127973748975, |
| "grad_norm": 0.6486605042735685, |
| "learning_rate": 5e-06, |
| "loss": 0.4996, |
| "step": 880 |
| }, |
| { |
| "epoch": 2.9204265791632484, |
| "grad_norm": 0.5285905512046695, |
| "learning_rate": 5e-06, |
| "loss": 0.5018, |
| "step": 890 |
| }, |
| { |
| "epoch": 2.9532403609516, |
| "grad_norm": 0.6212025899780993, |
| "learning_rate": 5e-06, |
| "loss": 0.4993, |
| "step": 900 |
| }, |
| { |
| "epoch": 2.9860541427399507, |
| "grad_norm": 0.6141734836175889, |
| "learning_rate": 5e-06, |
| "loss": 0.4973, |
| "step": 910 |
| }, |
| { |
| "epoch": 2.992616899097621, |
| "eval_loss": 0.5682421326637268, |
| "eval_runtime": 165.8526, |
| "eval_samples_per_second": 49.49, |
| "eval_steps_per_second": 0.392, |
| "step": 912 |
| }, |
| { |
| "epoch": 2.992616899097621, |
| "step": 912, |
| "total_flos": 1527215208529920.0, |
| "train_loss": 0.5539279884021533, |
| "train_runtime": 27411.4767, |
| "train_samples_per_second": 17.066, |
| "train_steps_per_second": 0.033 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 912, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1527215208529920.0, |
| "train_batch_size": 8, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|