{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 5.0,
  "eval_steps": 500,
  "global_step": 40,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.125,
      "grad_norm": 5.211779927637621,
      "learning_rate": 2e-05,
      "loss": 0.9857,
      "step": 1
    },
    {
      "epoch": 0.25,
      "grad_norm": 5.042557082573154,
      "learning_rate": 4e-05,
      "loss": 1.0105,
      "step": 2
    },
    {
      "epoch": 0.375,
      "grad_norm": 1.6016497975651993,
      "learning_rate": 6.000000000000001e-05,
      "loss": 0.8315,
      "step": 3
    },
    {
      "epoch": 0.5,
      "grad_norm": 4.25741909777284,
      "learning_rate": 8e-05,
      "loss": 0.8376,
      "step": 4
    },
    {
      "epoch": 0.625,
      "grad_norm": 4.221010570287036,
      "learning_rate": 7.984778792366983e-05,
      "loss": 0.8615,
      "step": 5
    },
    {
      "epoch": 0.75,
      "grad_norm": 2.7341942887765978,
      "learning_rate": 7.939231012048833e-05,
      "loss": 0.8683,
      "step": 6
    },
    {
      "epoch": 0.875,
      "grad_norm": 2.869440032757376,
      "learning_rate": 7.863703305156273e-05,
      "loss": 0.8362,
      "step": 7
    },
    {
      "epoch": 1.0,
      "grad_norm": 2.122066806364661,
      "learning_rate": 7.758770483143634e-05,
      "loss": 0.784,
      "step": 8
    },
    {
      "epoch": 1.125,
      "grad_norm": 1.439471039607689,
      "learning_rate": 7.625231148146601e-05,
      "loss": 0.7544,
      "step": 9
    },
    {
      "epoch": 1.25,
      "grad_norm": 1.6534051953010969,
      "learning_rate": 7.464101615137756e-05,
      "loss": 0.7631,
      "step": 10
    },
    {
      "epoch": 1.375,
      "grad_norm": 1.8137551875928488,
      "learning_rate": 7.276608177155968e-05,
      "loss": 0.7403,
      "step": 11
    },
    {
      "epoch": 1.5,
      "grad_norm": 1.3142504730332583,
      "learning_rate": 7.064177772475912e-05,
      "loss": 0.7002,
      "step": 12
    },
    {
      "epoch": 1.625,
      "grad_norm": 5.925202262290306,
      "learning_rate": 6.828427124746191e-05,
      "loss": 0.6861,
      "step": 13
    },
    {
      "epoch": 1.75,
      "grad_norm": 1.7227726654448416,
      "learning_rate": 6.571150438746157e-05,
      "loss": 0.7334,
      "step": 14
    },
    {
      "epoch": 1.875,
      "grad_norm": 1.0449895112458119,
      "learning_rate": 6.294305745404185e-05,
      "loss": 0.6708,
      "step": 15
    },
    {
      "epoch": 2.0,
      "grad_norm": 5.096810621766211,
      "learning_rate": 6.000000000000001e-05,
      "loss": 0.6817,
      "step": 16
    },
    {
      "epoch": 2.125,
      "grad_norm": 1.8744800353306919,
      "learning_rate": 5.6904730469627985e-05,
      "loss": 0.6989,
      "step": 17
    },
    {
      "epoch": 2.25,
      "grad_norm": 1.230587778975542,
      "learning_rate": 5.368080573302676e-05,
      "loss": 0.6331,
      "step": 18
    },
    {
      "epoch": 2.375,
      "grad_norm": 8.839539018981332,
      "learning_rate": 5.0352761804100835e-05,
      "loss": 0.6351,
      "step": 19
    },
    {
      "epoch": 2.5,
      "grad_norm": 5.01265204416041,
      "learning_rate": 4.694592710667723e-05,
      "loss": 0.661,
      "step": 20
    },
    {
      "epoch": 2.625,
      "grad_norm": 2.099879032906444,
      "learning_rate": 4.348622970990634e-05,
      "loss": 0.6938,
      "step": 21
    },
    {
      "epoch": 2.75,
      "grad_norm": 1.4954684606615343,
      "learning_rate": 4e-05,
      "loss": 0.6499,
      "step": 22
    },
    {
      "epoch": 2.875,
      "grad_norm": 0.742639587291529,
      "learning_rate": 3.6513770290093674e-05,
      "loss": 0.6396,
      "step": 23
    },
    {
      "epoch": 3.0,
      "grad_norm": 0.7952668726605489,
      "learning_rate": 3.305407289332279e-05,
      "loss": 0.6316,
      "step": 24
    },
    {
      "epoch": 3.125,
      "grad_norm": 0.9332649605547626,
      "learning_rate": 2.9647238195899168e-05,
      "loss": 0.6314,
      "step": 25
    },
    {
      "epoch": 3.25,
      "grad_norm": 0.8319077523070111,
      "learning_rate": 2.6319194266973256e-05,
      "loss": 0.5775,
      "step": 26
    },
    {
      "epoch": 3.375,
      "grad_norm": 0.6043436502068867,
      "learning_rate": 2.3095269530372032e-05,
      "loss": 0.5838,
      "step": 27
    },
    {
      "epoch": 3.5,
      "grad_norm": 0.48007057002472814,
      "learning_rate": 2.0000000000000012e-05,
      "loss": 0.5709,
      "step": 28
    },
    {
      "epoch": 3.625,
      "grad_norm": 0.4637812767567033,
      "learning_rate": 1.7056942545958167e-05,
      "loss": 0.5556,
      "step": 29
    },
    {
      "epoch": 3.75,
      "grad_norm": 0.46472503204848403,
      "learning_rate": 1.4288495612538427e-05,
      "loss": 0.5443,
      "step": 30
    },
    {
      "epoch": 3.875,
      "grad_norm": 0.46363908296159384,
      "learning_rate": 1.1715728752538103e-05,
      "loss": 0.557,
      "step": 31
    },
    {
      "epoch": 4.0,
      "grad_norm": 0.41103161754751094,
      "learning_rate": 9.358222275240884e-06,
      "loss": 0.5874,
      "step": 32
    },
    {
      "epoch": 4.125,
      "grad_norm": 0.3604630063954317,
      "learning_rate": 7.233918228440324e-06,
      "loss": 0.5322,
      "step": 33
    },
    {
      "epoch": 4.25,
      "grad_norm": 0.3152024677309081,
      "learning_rate": 5.358983848622452e-06,
      "loss": 0.5663,
      "step": 34
    },
    {
      "epoch": 4.375,
      "grad_norm": 0.2782735119012474,
      "learning_rate": 3.747688518534003e-06,
      "loss": 0.5334,
      "step": 35
    },
    {
      "epoch": 4.5,
      "grad_norm": 0.2517890470284474,
      "learning_rate": 2.4122951685636674e-06,
      "loss": 0.5227,
      "step": 36
    },
    {
      "epoch": 4.625,
      "grad_norm": 0.2307830936984302,
      "learning_rate": 1.3629669484372722e-06,
      "loss": 0.5605,
      "step": 37
    },
    {
      "epoch": 4.75,
      "grad_norm": 0.2120443064521722,
      "learning_rate": 6.076898795116792e-07,
      "loss": 0.5255,
      "step": 38
    },
    {
      "epoch": 4.875,
      "grad_norm": 0.21220571438664002,
      "learning_rate": 1.522120763301782e-07,
      "loss": 0.5098,
      "step": 39
    },
    {
      "epoch": 5.0,
      "grad_norm": 0.20885459665653022,
      "learning_rate": 0.0,
      "loss": 0.5482,
      "step": 40
    },
    {
      "epoch": 5.0,
      "step": 40,
      "total_flos": 671045690327040.0,
      "train_loss": 0.0,
      "train_runtime": 7.8738,
      "train_samples_per_second": 2427.029,
      "train_steps_per_second": 5.08
    }
  ],
  "logging_steps": 1,
  "max_steps": 40,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 671045690327040.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}