{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.4454976303317535,
  "eval_steps": 500,
  "global_step": 4000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.08616975441619991,
      "grad_norm": 1.8284112215042114,
      "learning_rate": 1.98e-05,
      "loss": 5.544659423828125,
      "step": 100
    },
    {
      "epoch": 0.17233950883239982,
      "grad_norm": 0.5319015383720398,
      "learning_rate": 3.979999999999999e-05,
      "loss": 3.90488037109375,
      "step": 200
    },
    {
      "epoch": 0.25850926324859974,
      "grad_norm": 1.3977950811386108,
      "learning_rate": 5.98e-05,
      "loss": 3.39756103515625,
      "step": 300
    },
    {
      "epoch": 0.34467901766479964,
      "grad_norm": 1.9291573762893677,
      "learning_rate": 7.98e-05,
      "loss": 3.019991149902344,
      "step": 400
    },
    {
      "epoch": 0.4308487720809996,
      "grad_norm": 1.4095340967178345,
      "learning_rate": 9.979999999999999e-05,
      "loss": 2.815445861816406,
      "step": 500
    },
    {
      "epoch": 0.5170185264971995,
      "grad_norm": 3.2716641426086426,
      "learning_rate": 0.00011979999999999998,
      "loss": 2.6590045166015623,
      "step": 600
    },
    {
      "epoch": 0.6031882809133994,
      "grad_norm": 1.3838716745376587,
      "learning_rate": 0.00013979999999999998,
      "loss": 2.543310089111328,
      "step": 700
    },
    {
      "epoch": 0.6893580353295993,
      "grad_norm": 1.069161057472229,
      "learning_rate": 0.00015979999999999998,
      "loss": 2.396273651123047,
      "step": 800
    },
    {
      "epoch": 0.7755277897457993,
      "grad_norm": 0.8585665822029114,
      "learning_rate": 0.0001798,
      "loss": 2.242165985107422,
      "step": 900
    },
    {
      "epoch": 0.8616975441619992,
      "grad_norm": 0.7467069625854492,
      "learning_rate": 0.0001998,
      "loss": 2.1027012634277344,
      "step": 1000
    },
    {
      "epoch": 0.9478672985781991,
      "grad_norm": 0.5805935859680176,
      "learning_rate": 0.00021979999999999998,
      "loss": 2.037454376220703,
      "step": 1100
    },
    {
      "epoch": 1.033606204222318,
      "grad_norm": 0.5948718786239624,
      "learning_rate": 0.00023979999999999997,
      "loss": 1.9681085205078126,
      "step": 1200
    },
    {
      "epoch": 1.1197759586385179,
      "grad_norm": 0.5413378477096558,
      "learning_rate": 0.00025979999999999997,
      "loss": 1.9135774230957032,
      "step": 1300
    },
    {
      "epoch": 1.2059457130547178,
      "grad_norm": 0.5196030139923096,
      "learning_rate": 0.00027979999999999997,
      "loss": 1.8392716979980468,
      "step": 1400
    },
    {
      "epoch": 1.2921154674709177,
      "grad_norm": 0.49619364738464355,
      "learning_rate": 0.00029979999999999997,
      "loss": 1.8049734497070313,
      "step": 1500
    },
    {
      "epoch": 1.3782852218871176,
      "grad_norm": 0.44414839148521423,
      "learning_rate": 0.000299991068233357,
      "loss": 1.7638165283203124,
      "step": 1600
    },
    {
      "epoch": 1.4644549763033177,
      "grad_norm": 0.46444711089134216,
      "learning_rate": 0.0002999639122316208,
      "loss": 1.7137832641601562,
      "step": 1700
    },
    {
      "epoch": 1.5506247307195173,
      "grad_norm": 0.5176238417625427,
      "learning_rate": 0.0002999185343831476,
      "loss": 1.675589599609375,
      "step": 1800
    },
    {
      "epoch": 1.6367944851357175,
      "grad_norm": 0.4177858829498291,
      "learning_rate": 0.0002998549402017187,
      "loss": 1.6349491882324219,
      "step": 1900
    },
    {
      "epoch": 1.7229642395519171,
      "grad_norm": 0.42198434472084045,
      "learning_rate": 0.0002997731374145493,
      "loss": 1.596505126953125,
      "step": 2000
    },
    {
      "epoch": 1.8091339939681172,
      "grad_norm": 0.4523915946483612,
      "learning_rate": 0.0002996731359613498,
      "loss": 1.5908058166503907,
      "step": 2100
    },
    {
      "epoch": 1.8953037483843171,
      "grad_norm": 0.3901713788509369,
      "learning_rate": 0.0002995549479931178,
      "loss": 1.5610142517089844,
      "step": 2200
    },
    {
      "epoch": 1.981473502800517,
      "grad_norm": 0.41816478967666626,
      "learning_rate": 0.00029941858787066206,
      "loss": 1.5319706726074218,
      "step": 2300
    },
    {
      "epoch": 2.067212408444636,
      "grad_norm": 0.3872755765914917,
      "learning_rate": 0.00029926407216285706,
      "loss": 1.5055549621582032,
      "step": 2400
    },
    {
      "epoch": 2.1533821628608356,
      "grad_norm": 0.4193103611469269,
      "learning_rate": 0.0002990914196446301,
      "loss": 1.4792218017578125,
      "step": 2500
    },
    {
      "epoch": 2.2395519172770357,
      "grad_norm": 0.4024358093738556,
      "learning_rate": 0.00029890065129467986,
      "loss": 1.4786280822753906,
      "step": 2600
    },
    {
      "epoch": 2.325721671693236,
      "grad_norm": 0.37588468194007874,
      "learning_rate": 0.0002986917902929273,
      "loss": 1.4545697021484374,
      "step": 2700
    },
    {
      "epoch": 2.4118914261094355,
      "grad_norm": 0.39736974239349365,
      "learning_rate": 0.0002984648620176991,
      "loss": 1.4498170471191407,
      "step": 2800
    },
    {
      "epoch": 2.4980611805256356,
      "grad_norm": 0.42380592226982117,
      "learning_rate": 0.00029821989404264424,
      "loss": 1.4262150573730468,
      "step": 2900
    },
    {
      "epoch": 2.5842309349418353,
      "grad_norm": 0.411803662776947,
      "learning_rate": 0.00029795691613338307,
      "loss": 1.417086181640625,
      "step": 3000
    },
    {
      "epoch": 2.6704006893580354,
      "grad_norm": 0.3662901818752289,
      "learning_rate": 0.000297675960243891,
      "loss": 1.3942941284179688,
      "step": 3100
    },
    {
      "epoch": 2.756570443774235,
      "grad_norm": 0.3642771244049072,
      "learning_rate": 0.00029737706051261557,
      "loss": 1.38471923828125,
      "step": 3200
    },
    {
      "epoch": 2.842740198190435,
      "grad_norm": 0.4138600826263428,
      "learning_rate": 0.00029706025325832857,
      "loss": 1.3765927124023438,
      "step": 3300
    },
    {
      "epoch": 2.9289099526066353,
      "grad_norm": 0.3687536418437958,
      "learning_rate": 0.0002967255769757127,
      "loss": 1.3617820739746094,
      "step": 3400
    },
    {
      "epoch": 3.014648858250754,
      "grad_norm": 0.3252148926258087,
      "learning_rate": 0.0002963730723306845,
      "loss": 1.3490205383300782,
      "step": 3500
    },
    {
      "epoch": 3.100818612666954,
      "grad_norm": 0.3874260187149048,
      "learning_rate": 0.0002960027821554529,
      "loss": 1.3380169677734375,
      "step": 3600
    },
    {
      "epoch": 3.1869883670831536,
      "grad_norm": 0.37778887152671814,
      "learning_rate": 0.00029561475144331467,
      "loss": 1.3190237426757812,
      "step": 3700
    },
    {
      "epoch": 3.2731581214993537,
      "grad_norm": 0.37266016006469727,
      "learning_rate": 0.00029520902734318766,
      "loss": 1.313209991455078,
      "step": 3800
    },
    {
      "epoch": 3.359327875915554,
      "grad_norm": 0.3792646527290344,
      "learning_rate": 0.00029478565915388153,
      "loss": 1.3055996704101562,
      "step": 3900
    },
    {
      "epoch": 3.4454976303317535,
      "grad_norm": 0.3583495318889618,
      "learning_rate": 0.00029434469831810764,
      "loss": 1.301021728515625,
      "step": 4000
    }
  ],
  "logging_steps": 100,
  "max_steps": 30000,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 26,
  "save_steps": 2000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.9825523114901504e+16,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}