{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 750.0,
  "eval_steps": 100,
  "global_step": 6000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 12.5,
      "grad_norm": 0.9450863599777222,
      "learning_rate": 5.94e-05,
      "loss": 3.582935485839844,
      "step": 100
    },
    {
      "epoch": 25.0,
      "grad_norm": 0.19262371957302094,
      "learning_rate": 0.0001194,
      "loss": 0.49665924072265627,
      "step": 200
    },
    {
      "epoch": 37.5,
      "grad_norm": 0.15991109609603882,
      "learning_rate": 0.00017939999999999997,
      "loss": 0.4021767044067383,
      "step": 300
    },
    {
      "epoch": 50.0,
      "grad_norm": 0.18609966337680817,
      "learning_rate": 0.0002394,
      "loss": 0.37090412139892576,
      "step": 400
    },
    {
      "epoch": 62.5,
      "grad_norm": 0.24805887043476105,
      "learning_rate": 0.00029939999999999996,
      "loss": 0.3474049758911133,
      "step": 500
    },
    {
      "epoch": 75.0,
      "grad_norm": 0.17464160919189453,
      "learning_rate": 0.0002980838709677419,
      "loss": 0.3306478118896484,
      "step": 600
    },
    {
      "epoch": 87.5,
      "grad_norm": 0.24544841051101685,
      "learning_rate": 0.00029614838709677416,
      "loss": 0.3101034927368164,
      "step": 700
    },
    {
      "epoch": 100.0,
      "grad_norm": 0.2018628716468811,
      "learning_rate": 0.00029421290322580645,
      "loss": 0.2937062835693359,
      "step": 800
    },
    {
      "epoch": 112.5,
      "grad_norm": 0.2136959582567215,
      "learning_rate": 0.0002922774193548387,
      "loss": 0.2772307777404785,
      "step": 900
    },
    {
      "epoch": 125.0,
      "grad_norm": 0.23597952723503113,
      "learning_rate": 0.0002903419354838709,
      "loss": 0.258614501953125,
      "step": 1000
    },
    {
      "epoch": 137.5,
      "grad_norm": 0.30438488721847534,
      "learning_rate": 0.0002884064516129032,
      "loss": 0.2398568344116211,
      "step": 1100
    },
    {
      "epoch": 150.0,
      "grad_norm": 0.27026715874671936,
      "learning_rate": 0.00028647096774193546,
      "loss": 0.21622713088989257,
      "step": 1200
    },
    {
      "epoch": 162.5,
      "grad_norm": 0.2623114287853241,
      "learning_rate": 0.0002845354838709677,
      "loss": 0.19229209899902344,
      "step": 1300
    },
    {
      "epoch": 175.0,
      "grad_norm": 0.34945833683013916,
      "learning_rate": 0.0002826,
      "loss": 0.16757678985595703,
      "step": 1400
    },
    {
      "epoch": 187.5,
      "grad_norm": 0.29883235692977905,
      "learning_rate": 0.0002806645161290322,
      "loss": 0.14341635704040528,
      "step": 1500
    },
    {
      "epoch": 200.0,
      "grad_norm": 0.31376898288726807,
      "learning_rate": 0.0002787290322580645,
      "loss": 0.1221920394897461,
      "step": 1600
    },
    {
      "epoch": 212.5,
      "grad_norm": 0.28367292881011963,
      "learning_rate": 0.00027679354838709675,
      "loss": 0.1032716178894043,
      "step": 1700
    },
    {
      "epoch": 225.0,
      "grad_norm": 0.2790682315826416,
      "learning_rate": 0.000274858064516129,
      "loss": 0.08668439865112304,
      "step": 1800
    },
    {
      "epoch": 237.5,
      "grad_norm": 0.2293432205915451,
      "learning_rate": 0.0002729225806451613,
      "loss": 0.0720753002166748,
      "step": 1900
    },
    {
      "epoch": 250.0,
      "grad_norm": 0.27616050839424133,
      "learning_rate": 0.0002709870967741935,
      "loss": 0.062033796310424806,
      "step": 2000
    },
    {
      "epoch": 262.5,
      "grad_norm": 0.2692248225212097,
      "learning_rate": 0.0002690516129032258,
      "loss": 0.05264517307281494,
      "step": 2100
    },
    {
      "epoch": 275.0,
      "grad_norm": 0.21932683885097504,
      "learning_rate": 0.00026711612903225805,
      "loss": 0.045755772590637206,
      "step": 2200
    },
    {
      "epoch": 287.5,
      "grad_norm": 0.20013022422790527,
      "learning_rate": 0.0002651806451612903,
      "loss": 0.04000330924987793,
      "step": 2300
    },
    {
      "epoch": 300.0,
      "grad_norm": 0.16391867399215698,
      "learning_rate": 0.0002632451612903226,
      "loss": 0.03490618944168091,
      "step": 2400
    },
    {
      "epoch": 312.5,
      "grad_norm": 0.19230681657791138,
      "learning_rate": 0.0002613096774193548,
      "loss": 0.031311240196228024,
      "step": 2500
    },
    {
      "epoch": 325.0,
      "grad_norm": 0.1750553548336029,
      "learning_rate": 0.00025937419354838705,
      "loss": 0.02794300317764282,
      "step": 2600
    },
    {
      "epoch": 337.5,
      "grad_norm": 0.2386818677186966,
      "learning_rate": 0.00025743870967741934,
      "loss": 0.02533245801925659,
      "step": 2700
    },
    {
      "epoch": 350.0,
      "grad_norm": 0.15160086750984192,
      "learning_rate": 0.00025550322580645163,
      "loss": 0.023450531959533692,
      "step": 2800
    },
    {
      "epoch": 362.5,
      "grad_norm": 0.15656723082065582,
      "learning_rate": 0.00025356774193548387,
      "loss": 0.02200608015060425,
      "step": 2900
    },
    {
      "epoch": 375.0,
      "grad_norm": 0.14112432301044464,
      "learning_rate": 0.0002516322580645161,
      "loss": 0.020031318664550782,
      "step": 3000
    },
    {
      "epoch": 387.5,
      "grad_norm": 0.12787914276123047,
      "learning_rate": 0.00024969677419354834,
      "loss": 0.01899993300437927,
      "step": 3100
    },
    {
      "epoch": 400.0,
      "grad_norm": 0.1528688669204712,
      "learning_rate": 0.00024776129032258063,
      "loss": 0.01810125231742859,
      "step": 3200
    },
    {
      "epoch": 412.5,
      "grad_norm": 0.15528737008571625,
      "learning_rate": 0.00024582580645161287,
      "loss": 0.016684828996658324,
      "step": 3300
    },
    {
      "epoch": 425.0,
      "grad_norm": 0.1281791627407074,
      "learning_rate": 0.00024389032258064514,
      "loss": 0.015172331333160401,
      "step": 3400
    },
    {
      "epoch": 437.5,
      "grad_norm": 0.11617272347211838,
      "learning_rate": 0.0002419548387096774,
      "loss": 0.01434700846672058,
      "step": 3500
    },
    {
      "epoch": 450.0,
      "grad_norm": 0.11877749860286713,
      "learning_rate": 0.00024001935483870966,
      "loss": 0.01436853289604187,
      "step": 3600
    },
    {
      "epoch": 462.5,
      "grad_norm": 0.11250139772891998,
      "learning_rate": 0.00023808387096774193,
      "loss": 0.013647955656051636,
      "step": 3700
    },
    {
      "epoch": 475.0,
      "grad_norm": 0.12692750990390778,
      "learning_rate": 0.00023614838709677417,
      "loss": 0.012936822175979613,
      "step": 3800
    },
    {
      "epoch": 487.5,
      "grad_norm": 0.08776593208312988,
      "learning_rate": 0.00023421290322580643,
      "loss": 0.01205775499343872,
      "step": 3900
    },
    {
      "epoch": 500.0,
      "grad_norm": 0.08575516194105148,
      "learning_rate": 0.00023227741935483867,
      "loss": 0.012118096351623536,
      "step": 4000
    },
    {
      "epoch": 512.5,
      "grad_norm": 0.11763694882392883,
      "learning_rate": 0.00023034193548387093,
      "loss": 0.010872763395309449,
      "step": 4100
    },
    {
      "epoch": 525.0,
      "grad_norm": 0.11833110451698303,
      "learning_rate": 0.00022840645161290322,
      "loss": 0.010899600982666015,
      "step": 4200
    },
    {
      "epoch": 537.5,
      "grad_norm": 0.11374954879283905,
      "learning_rate": 0.00022647096774193546,
      "loss": 0.010327227115631103,
      "step": 4300
    },
    {
      "epoch": 550.0,
      "grad_norm": 0.10840512067079544,
      "learning_rate": 0.00022453548387096773,
      "loss": 0.010270411968231202,
      "step": 4400
    },
    {
      "epoch": 562.5,
      "grad_norm": 0.07199712842702866,
      "learning_rate": 0.0002226,
      "loss": 0.009772901535034179,
      "step": 4500
    },
    {
      "epoch": 575.0,
      "grad_norm": 0.15016108751296997,
      "learning_rate": 0.00022066451612903223,
      "loss": 0.009401602745056152,
      "step": 4600
    },
    {
      "epoch": 587.5,
      "grad_norm": 0.08698810636997223,
      "learning_rate": 0.0002187290322580645,
      "loss": 0.00952852725982666,
      "step": 4700
    },
    {
      "epoch": 600.0,
      "grad_norm": 0.11057093739509583,
      "learning_rate": 0.00021679354838709678,
      "loss": 0.008922239542007446,
      "step": 4800
    },
    {
      "epoch": 612.5,
      "grad_norm": 0.11917728185653687,
      "learning_rate": 0.00021485806451612902,
      "loss": 0.008766108751296997,
      "step": 4900
    },
    {
      "epoch": 625.0,
      "grad_norm": 0.07486002892255783,
      "learning_rate": 0.00021292258064516128,
      "loss": 0.008633826971054076,
      "step": 5000
    },
    {
      "epoch": 637.5,
      "grad_norm": 0.11766602843999863,
      "learning_rate": 0.00021098709677419352,
      "loss": 0.008536132574081421,
      "step": 5100
    },
    {
      "epoch": 650.0,
      "grad_norm": 0.0582246296107769,
      "learning_rate": 0.00020905161290322579,
      "loss": 0.008086669445037841,
      "step": 5200
    },
    {
      "epoch": 662.5,
      "grad_norm": 0.0658862367272377,
      "learning_rate": 0.00020711612903225805,
      "loss": 0.007744500637054444,
      "step": 5300
    },
    {
      "epoch": 675.0,
      "grad_norm": 0.10022356361150742,
      "learning_rate": 0.0002051806451612903,
      "loss": 0.007699260115623474,
      "step": 5400
    },
    {
      "epoch": 687.5,
      "grad_norm": 0.12475644052028656,
      "learning_rate": 0.00020324516129032258,
      "loss": 0.0076804465055465695,
      "step": 5500
    },
    {
      "epoch": 700.0,
      "grad_norm": 0.12272350490093231,
      "learning_rate": 0.00020130967741935484,
      "loss": 0.007605299353599548,
      "step": 5600
    },
    {
      "epoch": 712.5,
      "grad_norm": 0.08131624013185501,
      "learning_rate": 0.00019937419354838708,
      "loss": 0.0075324904918670655,
      "step": 5700
    },
    {
      "epoch": 725.0,
      "grad_norm": 0.12169747799634933,
      "learning_rate": 0.00019743870967741935,
      "loss": 0.0071774739027023315,
      "step": 5800
    },
    {
      "epoch": 737.5,
      "grad_norm": 0.05529671907424927,
      "learning_rate": 0.00019550322580645158,
      "loss": 0.007025536298751831,
      "step": 5900
    },
    {
      "epoch": 750.0,
      "grad_norm": 0.07039643824100494,
      "learning_rate": 0.00019356774193548385,
      "loss": 0.006921111941337586,
      "step": 6000
    }
  ],
  "logging_steps": 100,
  "max_steps": 16000,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2000,
  "save_steps": 1000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 7074582945792000.0,
  "train_batch_size": 125,
  "trial_name": null,
  "trial_params": null
}