{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.536,
  "eval_steps": 500,
  "global_step": 240,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.032,
      "grad_norm": 0.047607421875,
      "learning_rate": 0.0001,
      "loss": 0.3836,
      "step": 5
    },
    {
      "epoch": 0.064,
      "grad_norm": 0.04541015625,
      "learning_rate": 0.0001,
      "loss": 0.2449,
      "step": 10
    },
    {
      "epoch": 0.096,
      "grad_norm": 0.056396484375,
      "learning_rate": 0.0001,
      "loss": 0.1513,
      "step": 15
    },
    {
      "epoch": 0.128,
      "grad_norm": 0.03857421875,
      "learning_rate": 0.0001,
      "loss": 0.0705,
      "step": 20
    },
    {
      "epoch": 0.16,
      "grad_norm": 0.0286865234375,
      "learning_rate": 0.0001,
      "loss": 0.0488,
      "step": 25
    },
    {
      "epoch": 0.192,
      "grad_norm": 0.03173828125,
      "learning_rate": 0.0001,
      "loss": 0.0391,
      "step": 30
    },
    {
      "epoch": 0.224,
      "grad_norm": 0.054931640625,
      "learning_rate": 0.0001,
      "loss": 0.0278,
      "step": 35
    },
    {
      "epoch": 0.256,
      "grad_norm": 0.0888671875,
      "learning_rate": 0.0001,
      "loss": 0.1414,
      "step": 40
    },
    {
      "epoch": 0.288,
      "grad_norm": 0.015625,
      "learning_rate": 0.0001,
      "loss": 0.0371,
      "step": 45
    },
    {
      "epoch": 0.32,
      "grad_norm": 0.0257568359375,
      "learning_rate": 0.0001,
      "loss": 0.0118,
      "step": 50
    },
    {
      "epoch": 0.352,
      "grad_norm": 0.02197265625,
      "learning_rate": 0.0001,
      "loss": 0.0101,
      "step": 55
    },
    {
      "epoch": 0.384,
      "grad_norm": 0.020751953125,
      "learning_rate": 0.0001,
      "loss": 0.0098,
      "step": 60
    },
    {
      "epoch": 0.416,
      "grad_norm": 0.0164794921875,
      "learning_rate": 0.0001,
      "loss": 0.0088,
      "step": 65
    },
    {
      "epoch": 0.448,
      "grad_norm": 0.0120849609375,
      "learning_rate": 0.0001,
      "loss": 0.0086,
      "step": 70
    },
    {
      "epoch": 0.48,
      "grad_norm": 0.0269775390625,
      "learning_rate": 0.0001,
      "loss": 0.0168,
      "step": 75
    },
    {
      "epoch": 0.512,
      "grad_norm": 0.05078125,
      "learning_rate": 0.0001,
      "loss": 0.0572,
      "step": 80
    },
    {
      "epoch": 0.544,
      "grad_norm": 0.031982421875,
      "learning_rate": 0.0001,
      "loss": 0.0092,
      "step": 85
    },
    {
      "epoch": 0.576,
      "grad_norm": 0.0196533203125,
      "learning_rate": 0.0001,
      "loss": 0.0077,
      "step": 90
    },
    {
      "epoch": 0.608,
      "grad_norm": 0.0238037109375,
      "learning_rate": 0.0001,
      "loss": 0.0054,
      "step": 95
    },
    {
      "epoch": 0.64,
      "grad_norm": 0.0108642578125,
      "learning_rate": 0.0001,
      "loss": 0.0043,
      "step": 100
    },
    {
      "epoch": 0.672,
      "grad_norm": 0.0091552734375,
      "learning_rate": 0.0001,
      "loss": 0.004,
      "step": 105
    },
    {
      "epoch": 0.704,
      "grad_norm": 0.01336669921875,
      "learning_rate": 0.0001,
      "loss": 0.0043,
      "step": 110
    },
    {
      "epoch": 0.736,
      "grad_norm": 0.033203125,
      "learning_rate": 0.0001,
      "loss": 0.0122,
      "step": 115
    },
    {
      "epoch": 0.768,
      "grad_norm": 0.0169677734375,
      "learning_rate": 0.0001,
      "loss": 0.0173,
      "step": 120
    },
    {
      "epoch": 0.8,
      "grad_norm": 0.00909423828125,
      "learning_rate": 0.0001,
      "loss": 0.0031,
      "step": 125
    },
    {
      "epoch": 0.832,
      "grad_norm": 0.01171875,
      "learning_rate": 0.0001,
      "loss": 0.0038,
      "step": 130
    },
    {
      "epoch": 0.864,
      "grad_norm": 0.00946044921875,
      "learning_rate": 0.0001,
      "loss": 0.0036,
      "step": 135
    },
    {
      "epoch": 0.896,
      "grad_norm": 0.014892578125,
      "learning_rate": 0.0001,
      "loss": 0.0047,
      "step": 140
    },
    {
      "epoch": 0.928,
      "grad_norm": 0.01239013671875,
      "learning_rate": 0.0001,
      "loss": 0.006,
      "step": 145
    },
    {
      "epoch": 0.96,
      "grad_norm": 0.00982666015625,
      "learning_rate": 0.0001,
      "loss": 0.0032,
      "step": 150
    },
    {
      "epoch": 0.992,
      "grad_norm": 0.01031494140625,
      "learning_rate": 0.0001,
      "loss": 0.0037,
      "step": 155
    },
    {
      "epoch": 1.024,
      "grad_norm": 0.006927490234375,
      "learning_rate": 0.0001,
      "loss": 0.0036,
      "step": 160
    },
    {
      "epoch": 1.056,
      "grad_norm": 0.0084228515625,
      "learning_rate": 0.0001,
      "loss": 0.0017,
      "step": 165
    },
    {
      "epoch": 1.088,
      "grad_norm": 0.005584716796875,
      "learning_rate": 0.0001,
      "loss": 0.0018,
      "step": 170
    },
    {
      "epoch": 1.12,
      "grad_norm": 0.006683349609375,
      "learning_rate": 0.0001,
      "loss": 0.0017,
      "step": 175
    },
    {
      "epoch": 1.152,
      "grad_norm": 0.004486083984375,
      "learning_rate": 0.0001,
      "loss": 0.0016,
      "step": 180
    },
    {
      "epoch": 1.184,
      "grad_norm": 0.0087890625,
      "learning_rate": 0.0001,
      "loss": 0.0026,
      "step": 185
    },
    {
      "epoch": 1.216,
      "grad_norm": 0.0062255859375,
      "learning_rate": 0.0001,
      "loss": 0.0015,
      "step": 190
    },
    {
      "epoch": 1.248,
      "grad_norm": 0.0128173828125,
      "learning_rate": 0.0001,
      "loss": 0.0026,
      "step": 195
    },
    {
      "epoch": 1.28,
      "grad_norm": 0.006683349609375,
      "learning_rate": 0.0001,
      "loss": 0.0039,
      "step": 200
    },
    {
      "epoch": 1.312,
      "grad_norm": 0.00787353515625,
      "learning_rate": 0.0001,
      "loss": 0.0019,
      "step": 205
    },
    {
      "epoch": 1.3439999999999999,
      "grad_norm": 0.0096435546875,
      "learning_rate": 0.0001,
      "loss": 0.0011,
      "step": 210
    },
    {
      "epoch": 1.376,
      "grad_norm": 0.0096435546875,
      "learning_rate": 0.0001,
      "loss": 0.0016,
      "step": 215
    },
    {
      "epoch": 1.408,
      "grad_norm": 0.005859375,
      "learning_rate": 0.0001,
      "loss": 0.0014,
      "step": 220
    },
    {
      "epoch": 1.44,
      "grad_norm": 0.00848388671875,
      "learning_rate": 0.0001,
      "loss": 0.0014,
      "step": 225
    },
    {
      "epoch": 1.472,
      "grad_norm": 0.015625,
      "learning_rate": 0.0001,
      "loss": 0.002,
      "step": 230
    },
    {
      "epoch": 1.504,
      "grad_norm": 0.03857421875,
      "learning_rate": 0.0001,
      "loss": 0.0067,
      "step": 235
    },
    {
      "epoch": 1.536,
      "grad_norm": 0.00811767578125,
      "learning_rate": 0.0001,
      "loss": 0.0062,
      "step": 240
    }
  ],
  "logging_steps": 5,
  "max_steps": 240,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 90,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 9.355246833433805e+17,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}