{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 750.0, "eval_steps": 100, "global_step": 6000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 12.5, "grad_norm": 0.9450863599777222, "learning_rate": 5.94e-05, "loss": 3.582935485839844, "step": 100 }, { "epoch": 25.0, "grad_norm": 0.19262371957302094, "learning_rate": 0.0001194, "loss": 0.49665924072265627, "step": 200 }, { "epoch": 37.5, "grad_norm": 0.15991109609603882, "learning_rate": 0.00017939999999999997, "loss": 0.4021767044067383, "step": 300 }, { "epoch": 50.0, "grad_norm": 0.18609966337680817, "learning_rate": 0.0002394, "loss": 0.37090412139892576, "step": 400 }, { "epoch": 62.5, "grad_norm": 0.24805887043476105, "learning_rate": 0.00029939999999999996, "loss": 0.3474049758911133, "step": 500 }, { "epoch": 75.0, "grad_norm": 0.17464160919189453, "learning_rate": 0.0002980838709677419, "loss": 0.3306478118896484, "step": 600 }, { "epoch": 87.5, "grad_norm": 0.24544841051101685, "learning_rate": 0.00029614838709677416, "loss": 0.3101034927368164, "step": 700 }, { "epoch": 100.0, "grad_norm": 0.2018628716468811, "learning_rate": 0.00029421290322580645, "loss": 0.2937062835693359, "step": 800 }, { "epoch": 112.5, "grad_norm": 0.2136959582567215, "learning_rate": 0.0002922774193548387, "loss": 0.2772307777404785, "step": 900 }, { "epoch": 125.0, "grad_norm": 0.23597952723503113, "learning_rate": 0.0002903419354838709, "loss": 0.258614501953125, "step": 1000 }, { "epoch": 137.5, "grad_norm": 0.30438488721847534, "learning_rate": 0.0002884064516129032, "loss": 0.2398568344116211, "step": 1100 }, { "epoch": 150.0, "grad_norm": 0.27026715874671936, "learning_rate": 0.00028647096774193546, "loss": 0.21622713088989257, "step": 1200 }, { "epoch": 162.5, "grad_norm": 0.2623114287853241, "learning_rate": 0.0002845354838709677, "loss": 0.19229209899902344, "step": 1300 }, { "epoch": 175.0, "grad_norm": 0.34945833683013916, "learning_rate": 0.0002826, "loss": 0.16757678985595703, "step": 1400 }, { "epoch": 187.5, "grad_norm": 0.29883235692977905, "learning_rate": 0.0002806645161290322, "loss": 0.14341635704040528, "step": 1500 }, { "epoch": 200.0, "grad_norm": 0.31376898288726807, "learning_rate": 0.0002787290322580645, "loss": 0.1221920394897461, "step": 1600 }, { "epoch": 212.5, "grad_norm": 0.28367292881011963, "learning_rate": 0.00027679354838709675, "loss": 0.1032716178894043, "step": 1700 }, { "epoch": 225.0, "grad_norm": 0.2790682315826416, "learning_rate": 0.000274858064516129, "loss": 0.08668439865112304, "step": 1800 }, { "epoch": 237.5, "grad_norm": 0.2293432205915451, "learning_rate": 0.0002729225806451613, "loss": 0.0720753002166748, "step": 1900 }, { "epoch": 250.0, "grad_norm": 0.27616050839424133, "learning_rate": 0.0002709870967741935, "loss": 0.062033796310424806, "step": 2000 }, { "epoch": 262.5, "grad_norm": 0.2692248225212097, "learning_rate": 0.0002690516129032258, "loss": 0.05264517307281494, "step": 2100 }, { "epoch": 275.0, "grad_norm": 0.21932683885097504, "learning_rate": 0.00026711612903225805, "loss": 0.045755772590637206, "step": 2200 }, { "epoch": 287.5, "grad_norm": 0.20013022422790527, "learning_rate": 0.0002651806451612903, "loss": 0.04000330924987793, "step": 2300 }, { "epoch": 300.0, "grad_norm": 0.16391867399215698, "learning_rate": 0.0002632451612903226, "loss": 0.03490618944168091, "step": 2400 }, { "epoch": 312.5, "grad_norm": 0.19230681657791138, "learning_rate": 0.0002613096774193548, "loss": 0.031311240196228024, "step": 2500 }, { "epoch": 325.0, "grad_norm": 0.1750553548336029, "learning_rate": 0.00025937419354838705, "loss": 0.02794300317764282, "step": 2600 }, { "epoch": 337.5, "grad_norm": 0.2386818677186966, "learning_rate": 0.00025743870967741934, "loss": 0.02533245801925659, "step": 2700 }, { "epoch": 350.0, "grad_norm": 0.15160086750984192, "learning_rate": 0.00025550322580645163, "loss": 0.023450531959533692, "step": 2800 }, { "epoch": 362.5, "grad_norm": 0.15656723082065582, "learning_rate": 0.00025356774193548387, "loss": 0.02200608015060425, "step": 2900 }, { "epoch": 375.0, "grad_norm": 0.14112432301044464, "learning_rate": 0.0002516322580645161, "loss": 0.020031318664550782, "step": 3000 }, { "epoch": 387.5, "grad_norm": 0.12787914276123047, "learning_rate": 0.00024969677419354834, "loss": 0.01899993300437927, "step": 3100 }, { "epoch": 400.0, "grad_norm": 0.1528688669204712, "learning_rate": 0.00024776129032258063, "loss": 0.01810125231742859, "step": 3200 }, { "epoch": 412.5, "grad_norm": 0.15528737008571625, "learning_rate": 0.00024582580645161287, "loss": 0.016684828996658324, "step": 3300 }, { "epoch": 425.0, "grad_norm": 0.1281791627407074, "learning_rate": 0.00024389032258064514, "loss": 0.015172331333160401, "step": 3400 }, { "epoch": 437.5, "grad_norm": 0.11617272347211838, "learning_rate": 0.0002419548387096774, "loss": 0.01434700846672058, "step": 3500 }, { "epoch": 450.0, "grad_norm": 0.11877749860286713, "learning_rate": 0.00024001935483870966, "loss": 0.01436853289604187, "step": 3600 }, { "epoch": 462.5, "grad_norm": 0.11250139772891998, "learning_rate": 0.00023808387096774193, "loss": 0.013647955656051636, "step": 3700 }, { "epoch": 475.0, "grad_norm": 0.12692750990390778, "learning_rate": 0.00023614838709677417, "loss": 0.012936822175979613, "step": 3800 }, { "epoch": 487.5, "grad_norm": 0.08776593208312988, "learning_rate": 0.00023421290322580643, "loss": 0.01205775499343872, "step": 3900 }, { "epoch": 500.0, "grad_norm": 0.08575516194105148, "learning_rate": 0.00023227741935483867, "loss": 0.012118096351623536, "step": 4000 }, { "epoch": 512.5, "grad_norm": 0.11763694882392883, "learning_rate": 0.00023034193548387093, "loss": 0.010872763395309449, "step": 4100 }, { "epoch": 525.0, "grad_norm": 0.11833110451698303, "learning_rate": 0.00022840645161290322, "loss": 0.010899600982666015, "step": 4200 }, { "epoch": 537.5, "grad_norm": 0.11374954879283905, "learning_rate": 0.00022647096774193546, "loss": 0.010327227115631103, "step": 4300 }, { "epoch": 550.0, "grad_norm": 0.10840512067079544, "learning_rate": 0.00022453548387096773, "loss": 0.010270411968231202, "step": 4400 }, { "epoch": 562.5, "grad_norm": 0.07199712842702866, "learning_rate": 0.0002226, "loss": 0.009772901535034179, "step": 4500 }, { "epoch": 575.0, "grad_norm": 0.15016108751296997, "learning_rate": 0.00022066451612903223, "loss": 0.009401602745056152, "step": 4600 }, { "epoch": 587.5, "grad_norm": 0.08698810636997223, "learning_rate": 0.0002187290322580645, "loss": 0.00952852725982666, "step": 4700 }, { "epoch": 600.0, "grad_norm": 0.11057093739509583, "learning_rate": 0.00021679354838709678, "loss": 0.008922239542007446, "step": 4800 }, { "epoch": 612.5, "grad_norm": 0.11917728185653687, "learning_rate": 0.00021485806451612902, "loss": 0.008766108751296997, "step": 4900 }, { "epoch": 625.0, "grad_norm": 0.07486002892255783, "learning_rate": 0.00021292258064516128, "loss": 0.008633826971054076, "step": 5000 }, { "epoch": 637.5, "grad_norm": 0.11766602843999863, "learning_rate": 0.00021098709677419352, "loss": 0.008536132574081421, "step": 5100 }, { "epoch": 650.0, "grad_norm": 0.0582246296107769, "learning_rate": 0.00020905161290322579, "loss": 0.008086669445037841, "step": 5200 }, { "epoch": 662.5, "grad_norm": 0.0658862367272377, "learning_rate": 0.00020711612903225805, "loss": 0.007744500637054444, "step": 5300 }, { "epoch": 675.0, "grad_norm": 0.10022356361150742, "learning_rate": 0.0002051806451612903, "loss": 0.007699260115623474, "step": 5400 }, { "epoch": 687.5, "grad_norm": 0.12475644052028656, "learning_rate": 0.00020324516129032258, "loss": 0.0076804465055465695, "step": 5500 }, { "epoch": 700.0, "grad_norm": 0.12272350490093231, "learning_rate": 0.00020130967741935484, "loss": 0.007605299353599548, "step": 5600 }, { "epoch": 712.5, "grad_norm": 0.08131624013185501, "learning_rate": 0.00019937419354838708, "loss": 0.0075324904918670655, "step": 5700 }, { "epoch": 725.0, "grad_norm": 0.12169747799634933, "learning_rate": 0.00019743870967741935, "loss": 0.0071774739027023315, "step": 5800 }, { "epoch": 737.5, "grad_norm": 0.05529671907424927, "learning_rate": 0.00019550322580645158, "loss": 0.007025536298751831, "step": 5900 }, { "epoch": 750.0, "grad_norm": 0.07039643824100494, "learning_rate": 0.00019356774193548385, "loss": 0.006921111941337586, "step": 6000 } ], "logging_steps": 100, "max_steps": 16000, "num_input_tokens_seen": 0, "num_train_epochs": 2000, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 7074582945792000.0, "train_batch_size": 125, "trial_name": null, "trial_params": null }