{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 500, "global_step": 590, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.08547008547008547, "grad_norm": 4.597737789154053, "learning_rate": 8e-05, "loss": 3.144, "step": 5 }, { "epoch": 0.17094017094017094, "grad_norm": 3.6257293224334717, "learning_rate": 0.00018, "loss": 1.023, "step": 10 }, { "epoch": 0.2564102564102564, "grad_norm": 0.9387032389640808, "learning_rate": 0.00019862068965517243, "loss": 0.3362, "step": 15 }, { "epoch": 0.3418803418803419, "grad_norm": 0.41102728247642517, "learning_rate": 0.00019689655172413795, "loss": 0.1727, "step": 20 }, { "epoch": 0.42735042735042733, "grad_norm": 0.504966676235199, "learning_rate": 0.00019517241379310345, "loss": 0.168, "step": 25 }, { "epoch": 0.5128205128205128, "grad_norm": 0.43475794792175293, "learning_rate": 0.00019344827586206898, "loss": 0.1699, "step": 30 }, { "epoch": 0.5982905982905983, "grad_norm": 0.5228849649429321, "learning_rate": 0.0001917241379310345, "loss": 0.1454, "step": 35 }, { "epoch": 0.6837606837606838, "grad_norm": 0.4822940230369568, "learning_rate": 0.00019, "loss": 0.206, "step": 40 }, { "epoch": 0.7692307692307693, "grad_norm": 0.4049399197101593, "learning_rate": 0.00018827586206896554, "loss": 0.132, "step": 45 }, { "epoch": 0.8547008547008547, "grad_norm": 0.4594310224056244, "learning_rate": 0.00018655172413793104, "loss": 0.1541, "step": 50 }, { "epoch": 0.9401709401709402, "grad_norm": 0.3666519820690155, "learning_rate": 0.00018482758620689654, "loss": 0.1144, "step": 55 }, { "epoch": 1.017094017094017, "grad_norm": 0.16928212344646454, "learning_rate": 0.00018310344827586207, "loss": 0.1208, "step": 60 }, { "epoch": 1.1025641025641026, "grad_norm": 0.09923699498176575, "learning_rate": 0.0001813793103448276, "loss": 0.0761, "step": 65 }, { "epoch": 1.188034188034188, "grad_norm": 0.23626509308815002, "learning_rate": 0.0001796551724137931, "loss": 0.0786, "step": 70 }, { "epoch": 1.2735042735042734, "grad_norm": 0.45735999941825867, "learning_rate": 0.00017793103448275862, "loss": 0.13, "step": 75 }, { "epoch": 1.358974358974359, "grad_norm": 0.24871651828289032, "learning_rate": 0.00017620689655172415, "loss": 0.0814, "step": 80 }, { "epoch": 1.4444444444444444, "grad_norm": 0.21524538099765778, "learning_rate": 0.00017448275862068965, "loss": 0.0737, "step": 85 }, { "epoch": 1.5299145299145298, "grad_norm": 0.4590378701686859, "learning_rate": 0.00017275862068965518, "loss": 0.0955, "step": 90 }, { "epoch": 1.6153846153846154, "grad_norm": 0.7036776542663574, "learning_rate": 0.0001710344827586207, "loss": 0.0671, "step": 95 }, { "epoch": 1.7008547008547008, "grad_norm": 0.26162663102149963, "learning_rate": 0.0001693103448275862, "loss": 0.0828, "step": 100 }, { "epoch": 1.7863247863247862, "grad_norm": 0.4105569124221802, "learning_rate": 0.00016758620689655173, "loss": 0.0768, "step": 105 }, { "epoch": 1.8717948717948718, "grad_norm": 0.3037894666194916, "learning_rate": 0.00016586206896551726, "loss": 0.1149, "step": 110 }, { "epoch": 1.9572649572649574, "grad_norm": 0.19420042634010315, "learning_rate": 0.00016413793103448276, "loss": 0.0635, "step": 115 }, { "epoch": 2.034188034188034, "grad_norm": 0.13855452835559845, "learning_rate": 0.0001624137931034483, "loss": 0.0594, "step": 120 }, { "epoch": 2.1196581196581197, "grad_norm": 0.17749273777008057, "learning_rate": 0.00016068965517241382, "loss": 0.0725, "step": 125 }, { "epoch": 2.2051282051282053, "grad_norm": 0.13107630610466003, "learning_rate": 0.00015896551724137932, "loss": 0.0619, "step": 130 }, { "epoch": 2.2905982905982905, "grad_norm": 0.11133825778961182, "learning_rate": 0.00015724137931034485, "loss": 0.0624, "step": 135 }, { "epoch": 2.376068376068376, "grad_norm": 0.187343031167984, "learning_rate": 0.00015551724137931037, "loss": 0.0581, "step": 140 }, { "epoch": 2.4615384615384617, "grad_norm": 0.27685755491256714, "learning_rate": 0.00015379310344827587, "loss": 0.0613, "step": 145 }, { "epoch": 2.547008547008547, "grad_norm": 0.4320373833179474, "learning_rate": 0.0001520689655172414, "loss": 0.0735, "step": 150 }, { "epoch": 2.6324786324786325, "grad_norm": 0.13862545788288116, "learning_rate": 0.0001503448275862069, "loss": 0.0582, "step": 155 }, { "epoch": 2.717948717948718, "grad_norm": 0.7963452339172363, "learning_rate": 0.00014862068965517243, "loss": 0.0651, "step": 160 }, { "epoch": 2.8034188034188032, "grad_norm": 0.14564156532287598, "learning_rate": 0.00014689655172413793, "loss": 0.0559, "step": 165 }, { "epoch": 2.888888888888889, "grad_norm": 0.15069833397865295, "learning_rate": 0.00014517241379310346, "loss": 0.0529, "step": 170 }, { "epoch": 2.9743589743589745, "grad_norm": 0.3557753562927246, "learning_rate": 0.00014344827586206896, "loss": 0.0773, "step": 175 }, { "epoch": 3.051282051282051, "grad_norm": 0.08716096729040146, "learning_rate": 0.0001417241379310345, "loss": 0.0513, "step": 180 }, { "epoch": 3.1367521367521367, "grad_norm": 0.15282496809959412, "learning_rate": 0.00014, "loss": 0.0621, "step": 185 }, { "epoch": 3.2222222222222223, "grad_norm": 0.09816001355648041, "learning_rate": 0.00013827586206896552, "loss": 0.0648, "step": 190 }, { "epoch": 3.3076923076923075, "grad_norm": 0.13748367130756378, "learning_rate": 0.00013655172413793104, "loss": 0.0485, "step": 195 }, { "epoch": 3.393162393162393, "grad_norm": 0.10656469315290451, "learning_rate": 0.00013482758620689654, "loss": 0.0531, "step": 200 }, { "epoch": 3.4786324786324787, "grad_norm": 0.1901499480009079, "learning_rate": 0.00013310344827586207, "loss": 0.0612, "step": 205 }, { "epoch": 3.564102564102564, "grad_norm": 0.16148889064788818, "learning_rate": 0.0001313793103448276, "loss": 0.0546, "step": 210 }, { "epoch": 3.6495726495726495, "grad_norm": 0.19384047389030457, "learning_rate": 0.0001296551724137931, "loss": 0.0589, "step": 215 }, { "epoch": 3.735042735042735, "grad_norm": 0.08794084936380386, "learning_rate": 0.00012793103448275863, "loss": 0.0573, "step": 220 }, { "epoch": 3.8205128205128203, "grad_norm": 0.10576070100069046, "learning_rate": 0.00012620689655172415, "loss": 0.0471, "step": 225 }, { "epoch": 3.905982905982906, "grad_norm": 0.08111118525266647, "learning_rate": 0.00012448275862068966, "loss": 0.0572, "step": 230 }, { "epoch": 3.9914529914529915, "grad_norm": 0.4230298101902008, "learning_rate": 0.00012275862068965518, "loss": 0.0617, "step": 235 }, { "epoch": 4.068376068376068, "grad_norm": 0.08736063539981842, "learning_rate": 0.00012103448275862071, "loss": 0.0493, "step": 240 }, { "epoch": 4.153846153846154, "grad_norm": 0.06979858875274658, "learning_rate": 0.00011931034482758621, "loss": 0.0469, "step": 245 }, { "epoch": 4.239316239316239, "grad_norm": 0.10242439806461334, "learning_rate": 0.00011758620689655173, "loss": 0.0508, "step": 250 }, { "epoch": 4.3247863247863245, "grad_norm": 0.11685860902070999, "learning_rate": 0.00011586206896551725, "loss": 0.0522, "step": 255 }, { "epoch": 4.410256410256411, "grad_norm": 0.1084512323141098, "learning_rate": 0.00011413793103448275, "loss": 0.0519, "step": 260 }, { "epoch": 4.495726495726496, "grad_norm": 0.09368503093719482, "learning_rate": 0.00011241379310344828, "loss": 0.0494, "step": 265 }, { "epoch": 4.581196581196581, "grad_norm": 0.1777074784040451, "learning_rate": 0.00011068965517241381, "loss": 0.0515, "step": 270 }, { "epoch": 4.666666666666667, "grad_norm": 0.056768111884593964, "learning_rate": 0.00010896551724137931, "loss": 0.044, "step": 275 }, { "epoch": 4.752136752136752, "grad_norm": 0.08062291890382767, "learning_rate": 0.00010724137931034484, "loss": 0.0476, "step": 280 }, { "epoch": 4.837606837606837, "grad_norm": 0.09975454211235046, "learning_rate": 0.00010551724137931037, "loss": 0.0522, "step": 285 }, { "epoch": 4.923076923076923, "grad_norm": 0.14652380347251892, "learning_rate": 0.00010379310344827587, "loss": 0.0498, "step": 290 }, { "epoch": 5.0, "grad_norm": 0.12409216165542603, "learning_rate": 0.0001020689655172414, "loss": 0.0474, "step": 295 }, { "epoch": 5.085470085470085, "grad_norm": 0.09494274109601974, "learning_rate": 0.0001003448275862069, "loss": 0.0462, "step": 300 }, { "epoch": 5.170940170940171, "grad_norm": 0.1240062415599823, "learning_rate": 9.862068965517242e-05, "loss": 0.0425, "step": 305 }, { "epoch": 5.256410256410256, "grad_norm": 0.1713438332080841, "learning_rate": 9.689655172413794e-05, "loss": 0.0431, "step": 310 }, { "epoch": 5.3418803418803416, "grad_norm": 0.1990644931793213, "learning_rate": 9.517241379310345e-05, "loss": 0.048, "step": 315 }, { "epoch": 5.427350427350428, "grad_norm": 0.09711036831140518, "learning_rate": 9.344827586206896e-05, "loss": 0.0476, "step": 320 }, { "epoch": 5.512820512820513, "grad_norm": 0.11504214257001877, "learning_rate": 9.172413793103448e-05, "loss": 0.0494, "step": 325 }, { "epoch": 5.598290598290598, "grad_norm": 0.08380427211523056, "learning_rate": 9e-05, "loss": 0.047, "step": 330 }, { "epoch": 5.683760683760684, "grad_norm": 0.08641541749238968, "learning_rate": 8.827586206896552e-05, "loss": 0.0457, "step": 335 }, { "epoch": 5.769230769230769, "grad_norm": 0.0935196503996849, "learning_rate": 8.655172413793103e-05, "loss": 0.0489, "step": 340 }, { "epoch": 5.854700854700854, "grad_norm": 0.11386577785015106, "learning_rate": 8.482758620689656e-05, "loss": 0.0479, "step": 345 }, { "epoch": 5.94017094017094, "grad_norm": 0.08249244838953018, "learning_rate": 8.310344827586208e-05, "loss": 0.0469, "step": 350 }, { "epoch": 6.017094017094017, "grad_norm": 0.09115161001682281, "learning_rate": 8.137931034482759e-05, "loss": 0.0455, "step": 355 }, { "epoch": 6.102564102564102, "grad_norm": 0.06610054522752762, "learning_rate": 7.965517241379312e-05, "loss": 0.0432, "step": 360 }, { "epoch": 6.188034188034188, "grad_norm": 0.09798604249954224, "learning_rate": 7.793103448275862e-05, "loss": 0.0442, "step": 365 }, { "epoch": 6.273504273504273, "grad_norm": 0.12107487767934799, "learning_rate": 7.620689655172413e-05, "loss": 0.0418, "step": 370 }, { "epoch": 6.358974358974359, "grad_norm": 0.10651250928640366, "learning_rate": 7.448275862068966e-05, "loss": 0.0437, "step": 375 }, { "epoch": 6.444444444444445, "grad_norm": 0.09335967153310776, "learning_rate": 7.275862068965517e-05, "loss": 0.044, "step": 380 }, { "epoch": 6.52991452991453, "grad_norm": 0.10894130915403366, "learning_rate": 7.103448275862069e-05, "loss": 0.0493, "step": 385 }, { "epoch": 6.615384615384615, "grad_norm": 0.09522519260644913, "learning_rate": 6.931034482758622e-05, "loss": 0.0463, "step": 390 }, { "epoch": 6.700854700854701, "grad_norm": 0.09910976886749268, "learning_rate": 6.758620689655173e-05, "loss": 0.0427, "step": 395 }, { "epoch": 6.786324786324786, "grad_norm": 0.11286190897226334, "learning_rate": 6.586206896551724e-05, "loss": 0.0444, "step": 400 }, { "epoch": 6.871794871794872, "grad_norm": 0.07890793681144714, "learning_rate": 6.413793103448276e-05, "loss": 0.0407, "step": 405 }, { "epoch": 6.957264957264957, "grad_norm": 0.08769431710243225, "learning_rate": 6.241379310344829e-05, "loss": 0.0479, "step": 410 }, { "epoch": 7.034188034188034, "grad_norm": 0.06925784051418304, "learning_rate": 6.068965517241379e-05, "loss": 0.0439, "step": 415 }, { "epoch": 7.119658119658119, "grad_norm": 0.08389502763748169, "learning_rate": 5.896551724137931e-05, "loss": 0.0437, "step": 420 }, { "epoch": 7.205128205128205, "grad_norm": 0.10391002893447876, "learning_rate": 5.7241379310344835e-05, "loss": 0.042, "step": 425 }, { "epoch": 7.2905982905982905, "grad_norm": 0.09842480719089508, "learning_rate": 5.551724137931035e-05, "loss": 0.0407, "step": 430 }, { "epoch": 7.3760683760683765, "grad_norm": 0.09367308020591736, "learning_rate": 5.379310344827586e-05, "loss": 0.0422, "step": 435 }, { "epoch": 7.461538461538462, "grad_norm": 0.11631827801465988, "learning_rate": 5.2068965517241384e-05, "loss": 0.0453, "step": 440 }, { "epoch": 7.547008547008547, "grad_norm": 0.13546331226825714, "learning_rate": 5.03448275862069e-05, "loss": 0.0405, "step": 445 }, { "epoch": 7.632478632478632, "grad_norm": 0.1015164852142334, "learning_rate": 4.862068965517241e-05, "loss": 0.0433, "step": 450 }, { "epoch": 7.717948717948718, "grad_norm": 0.12304691225290298, "learning_rate": 4.689655172413793e-05, "loss": 0.0439, "step": 455 }, { "epoch": 7.803418803418803, "grad_norm": 0.11133451014757156, "learning_rate": 4.5172413793103454e-05, "loss": 0.0404, "step": 460 }, { "epoch": 7.888888888888889, "grad_norm": 0.11199292540550232, "learning_rate": 4.344827586206897e-05, "loss": 0.0401, "step": 465 }, { "epoch": 7.9743589743589745, "grad_norm": 0.10854869335889816, "learning_rate": 4.172413793103448e-05, "loss": 0.047, "step": 470 }, { "epoch": 8.051282051282051, "grad_norm": 0.08034314215183258, "learning_rate": 4e-05, "loss": 0.0372, "step": 475 }, { "epoch": 8.136752136752136, "grad_norm": 0.07888869941234589, "learning_rate": 3.827586206896552e-05, "loss": 0.0374, "step": 480 }, { "epoch": 8.222222222222221, "grad_norm": 0.08299173414707184, "learning_rate": 3.655172413793104e-05, "loss": 0.0415, "step": 485 }, { "epoch": 8.307692307692308, "grad_norm": 0.10082942992448807, "learning_rate": 3.482758620689655e-05, "loss": 0.0431, "step": 490 }, { "epoch": 8.393162393162394, "grad_norm": 0.13129588961601257, "learning_rate": 3.310344827586207e-05, "loss": 0.0381, "step": 495 }, { "epoch": 8.478632478632479, "grad_norm": 0.0956198126077652, "learning_rate": 3.137931034482759e-05, "loss": 0.0391, "step": 500 }, { "epoch": 8.564102564102564, "grad_norm": 0.10935048758983612, "learning_rate": 2.96551724137931e-05, "loss": 0.0415, "step": 505 }, { "epoch": 8.649572649572649, "grad_norm": 0.09700857102870941, "learning_rate": 2.7931034482758622e-05, "loss": 0.042, "step": 510 }, { "epoch": 8.735042735042736, "grad_norm": 0.09681924432516098, "learning_rate": 2.620689655172414e-05, "loss": 0.041, "step": 515 }, { "epoch": 8.820512820512821, "grad_norm": 0.10170122236013412, "learning_rate": 2.4482758620689654e-05, "loss": 0.0404, "step": 520 }, { "epoch": 8.905982905982906, "grad_norm": 0.10559462755918503, "learning_rate": 2.2758620689655175e-05, "loss": 0.0395, "step": 525 }, { "epoch": 8.991452991452991, "grad_norm": 0.11863423138856888, "learning_rate": 2.1034482758620692e-05, "loss": 0.0433, "step": 530 }, { "epoch": 9.068376068376068, "grad_norm": 0.0633588433265686, "learning_rate": 1.9310344827586207e-05, "loss": 0.0383, "step": 535 }, { "epoch": 9.153846153846153, "grad_norm": 0.08409127593040466, "learning_rate": 1.7586206896551724e-05, "loss": 0.038, "step": 540 }, { "epoch": 9.239316239316238, "grad_norm": 0.12133090943098068, "learning_rate": 1.586206896551724e-05, "loss": 0.0366, "step": 545 }, { "epoch": 9.324786324786325, "grad_norm": 0.09883731603622437, "learning_rate": 1.4137931034482759e-05, "loss": 0.0386, "step": 550 }, { "epoch": 9.41025641025641, "grad_norm": 0.20076970756053925, "learning_rate": 1.2413793103448277e-05, "loss": 0.0375, "step": 555 }, { "epoch": 9.495726495726496, "grad_norm": 0.103940449655056, "learning_rate": 1.0689655172413794e-05, "loss": 0.0394, "step": 560 }, { "epoch": 9.581196581196581, "grad_norm": 0.09235844761133194, "learning_rate": 8.96551724137931e-06, "loss": 0.0405, "step": 565 }, { "epoch": 9.666666666666666, "grad_norm": 0.07304095476865768, "learning_rate": 7.241379310344828e-06, "loss": 0.0352, "step": 570 }, { "epoch": 9.752136752136753, "grad_norm": 0.12776847183704376, "learning_rate": 5.517241379310345e-06, "loss": 0.04, "step": 575 }, { "epoch": 9.837606837606838, "grad_norm": 0.11009430885314941, "learning_rate": 3.793103448275862e-06, "loss": 0.0374, "step": 580 }, { "epoch": 9.923076923076923, "grad_norm": 0.13841569423675537, "learning_rate": 2.0689655172413796e-06, "loss": 0.0401, "step": 585 }, { "epoch": 10.0, "grad_norm": 0.1534666121006012, "learning_rate": 3.4482758620689656e-07, "loss": 0.0366, "step": 590 }, { "epoch": 10.0, "step": 590, "total_flos": 9496524054435840.0, "train_loss": 0.09625538042036154, "train_runtime": 681.7113, "train_samples_per_second": 6.85, "train_steps_per_second": 0.865 } ], "logging_steps": 5, "max_steps": 590, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9496524054435840.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }