{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.0384,
  "eval_steps": 1000,
  "global_step": 600,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.00064,
      "grad_norm": 1.6144306659698486,
      "learning_rate": 1.1520000000000002e-08,
      "loss": 0.729,
      "step": 10
    },
    {
      "epoch": 0.00128,
      "grad_norm": 2.0952296257019043,
      "learning_rate": 2.4320000000000002e-08,
      "loss": 0.7295,
      "step": 20
    },
    {
      "epoch": 0.00192,
      "grad_norm": 1.3587689399719238,
      "learning_rate": 3.7120000000000004e-08,
      "loss": 0.73,
      "step": 30
    },
    {
      "epoch": 0.00256,
      "grad_norm": 1.2531732320785522,
      "learning_rate": 4.9920000000000006e-08,
      "loss": 0.7221,
      "step": 40
    },
    {
      "epoch": 0.0032,
      "grad_norm": 1.437932014465332,
      "learning_rate": 6.272000000000001e-08,
      "loss": 0.7209,
      "step": 50
    },
    {
      "epoch": 0.00384,
      "grad_norm": 1.418426752090454,
      "learning_rate": 7.552e-08,
      "loss": 0.729,
      "step": 60
    },
    {
      "epoch": 0.00448,
      "grad_norm": 1.9476298093795776,
      "learning_rate": 8.832e-08,
      "loss": 0.7242,
      "step": 70
    },
    {
      "epoch": 0.00512,
      "grad_norm": 1.7948051691055298,
      "learning_rate": 1.0112000000000001e-07,
      "loss": 0.7227,
      "step": 80
    },
    {
      "epoch": 0.00576,
      "grad_norm": 1.6534360647201538,
      "learning_rate": 1.1392e-07,
      "loss": 0.7234,
      "step": 90
    },
    {
      "epoch": 0.0064,
      "grad_norm": 1.0920158624649048,
      "learning_rate": 1.2672e-07,
      "loss": 0.7328,
      "step": 100
    },
    {
      "epoch": 0.00704,
      "grad_norm": 1.977837085723877,
      "learning_rate": 1.3952000000000002e-07,
      "loss": 0.7263,
      "step": 110
    },
    {
      "epoch": 0.00768,
      "grad_norm": 1.388983130455017,
      "learning_rate": 1.5232000000000003e-07,
      "loss": 0.7286,
      "step": 120
    },
    {
      "epoch": 0.00832,
      "grad_norm": 1.2956682443618774,
      "learning_rate": 1.6512e-07,
      "loss": 0.7251,
      "step": 130
    },
    {
      "epoch": 0.00896,
      "grad_norm": 1.8125052452087402,
      "learning_rate": 1.7792e-07,
      "loss": 0.7251,
      "step": 140
    },
    {
      "epoch": 0.0096,
      "grad_norm": 1.626846194267273,
      "learning_rate": 1.9072e-07,
      "loss": 0.727,
      "step": 150
    },
    {
      "epoch": 0.01024,
      "grad_norm": 2.3243086338043213,
      "learning_rate": 2.0352e-07,
      "loss": 0.726,
      "step": 160
    },
    {
      "epoch": 0.01088,
      "grad_norm": 1.4734737873077393,
      "learning_rate": 2.1632e-07,
      "loss": 0.7252,
      "step": 170
    },
    {
      "epoch": 0.01152,
      "grad_norm": 2.090498685836792,
      "learning_rate": 2.2912e-07,
      "loss": 0.7273,
      "step": 180
    },
    {
      "epoch": 0.01216,
      "grad_norm": 1.7563093900680542,
      "learning_rate": 2.4192000000000004e-07,
      "loss": 0.719,
      "step": 190
    },
    {
      "epoch": 0.0128,
      "grad_norm": 1.449843168258667,
      "learning_rate": 2.5472000000000005e-07,
      "loss": 0.7237,
      "step": 200
    },
    {
      "epoch": 0.01344,
      "grad_norm": 2.1326472759246826,
      "learning_rate": 2.6752000000000006e-07,
      "loss": 0.7305,
      "step": 210
    },
    {
      "epoch": 0.01408,
      "grad_norm": 2.21703839302063,
      "learning_rate": 2.8032e-07,
      "loss": 0.7167,
      "step": 220
    },
    {
      "epoch": 0.01472,
      "grad_norm": 1.6385700702667236,
      "learning_rate": 2.9312e-07,
      "loss": 0.7209,
      "step": 230
    },
    {
      "epoch": 0.01536,
      "grad_norm": 1.4293471574783325,
      "learning_rate": 3.0592000000000003e-07,
      "loss": 0.722,
      "step": 240
    },
    {
      "epoch": 0.016,
      "grad_norm": 2.1437904834747314,
      "learning_rate": 3.1872e-07,
      "loss": 0.717,
      "step": 250
    },
    {
      "epoch": 0.01664,
      "grad_norm": 2.014806032180786,
      "learning_rate": 3.3152000000000005e-07,
      "loss": 0.7182,
      "step": 260
    },
    {
      "epoch": 0.01728,
      "grad_norm": 1.7216386795043945,
      "learning_rate": 3.4432e-07,
      "loss": 0.7253,
      "step": 270
    },
    {
      "epoch": 0.01792,
      "grad_norm": 1.4267009496688843,
      "learning_rate": 3.5712e-07,
      "loss": 0.7189,
      "step": 280
    },
    {
      "epoch": 0.01856,
      "grad_norm": 2.222503185272217,
      "learning_rate": 3.6992e-07,
      "loss": 0.7198,
      "step": 290
    },
    {
      "epoch": 0.0192,
      "grad_norm": 1.578922986984253,
      "learning_rate": 3.8272000000000003e-07,
      "loss": 0.717,
      "step": 300
    },
    {
      "epoch": 0.01984,
      "grad_norm": 1.719905972480774,
      "learning_rate": 3.9552e-07,
      "loss": 0.709,
      "step": 310
    },
    {
      "epoch": 0.02048,
      "grad_norm": 1.4473963975906372,
      "learning_rate": 4.0832000000000005e-07,
      "loss": 0.7215,
      "step": 320
    },
    {
      "epoch": 0.02112,
      "grad_norm": 2.1639790534973145,
      "learning_rate": 4.2112e-07,
      "loss": 0.7175,
      "step": 330
    },
    {
      "epoch": 0.02176,
      "grad_norm": 1.2387958765029907,
      "learning_rate": 4.3392e-07,
      "loss": 0.7129,
      "step": 340
    },
    {
      "epoch": 0.0224,
      "grad_norm": 2.2797842025756836,
      "learning_rate": 4.4672000000000007e-07,
      "loss": 0.7159,
      "step": 350
    },
    {
      "epoch": 0.02304,
      "grad_norm": 1.5692473649978638,
      "learning_rate": 4.5952000000000003e-07,
      "loss": 0.7161,
      "step": 360
    },
    {
      "epoch": 0.02368,
      "grad_norm": 1.4270817041397095,
      "learning_rate": 4.723200000000001e-07,
      "loss": 0.7114,
      "step": 370
    },
    {
      "epoch": 0.02432,
      "grad_norm": 1.4091335535049438,
      "learning_rate": 4.8512e-07,
      "loss": 0.7127,
      "step": 380
    },
    {
      "epoch": 0.02496,
      "grad_norm": 1.8862844705581665,
      "learning_rate": 4.979200000000001e-07,
      "loss": 0.7153,
      "step": 390
    },
    {
      "epoch": 0.0256,
      "grad_norm": 1.9264376163482666,
      "learning_rate": 5.107200000000001e-07,
      "loss": 0.7109,
      "step": 400
    },
    {
      "epoch": 0.02624,
      "grad_norm": 1.4058727025985718,
      "learning_rate": 5.235200000000001e-07,
      "loss": 0.705,
      "step": 410
    },
    {
      "epoch": 0.02688,
      "grad_norm": 1.519445776939392,
      "learning_rate": 5.363200000000001e-07,
      "loss": 0.7131,
      "step": 420
    },
    {
      "epoch": 0.02752,
      "grad_norm": 1.6636698246002197,
      "learning_rate": 5.491200000000001e-07,
      "loss": 0.6916,
      "step": 430
    },
    {
      "epoch": 0.02816,
      "grad_norm": 1.5472590923309326,
      "learning_rate": 5.6192e-07,
      "loss": 0.705,
      "step": 440
    },
    {
      "epoch": 0.0288,
      "grad_norm": 1.4896206855773926,
      "learning_rate": 5.747200000000001e-07,
      "loss": 0.7046,
      "step": 450
    },
    {
      "epoch": 0.02944,
      "grad_norm": 2.2565503120422363,
      "learning_rate": 5.8752e-07,
      "loss": 0.7009,
      "step": 460
    },
    {
      "epoch": 0.03008,
      "grad_norm": 2.017638683319092,
      "learning_rate": 6.0032e-07,
      "loss": 0.7058,
      "step": 470
    },
    {
      "epoch": 0.03072,
      "grad_norm": 1.3399696350097656,
      "learning_rate": 6.1312e-07,
      "loss": 0.7003,
      "step": 480
    },
    {
      "epoch": 0.03136,
      "grad_norm": 1.3090866804122925,
      "learning_rate": 6.2592e-07,
      "loss": 0.7067,
      "step": 490
    },
    {
      "epoch": 0.032,
      "grad_norm": 1.4199142456054688,
      "learning_rate": 6.3872e-07,
      "loss": 0.7008,
      "step": 500
    },
    {
      "epoch": 0.03264,
      "grad_norm": 1.7174904346466064,
      "learning_rate": 6.515200000000001e-07,
      "loss": 0.7003,
      "step": 510
    },
    {
      "epoch": 0.03328,
      "grad_norm": 1.2983943223953247,
      "learning_rate": 6.643200000000001e-07,
      "loss": 0.698,
      "step": 520
    },
    {
      "epoch": 0.03392,
      "grad_norm": 1.8224154710769653,
      "learning_rate": 6.7712e-07,
      "loss": 0.7047,
      "step": 530
    },
    {
      "epoch": 0.03456,
      "grad_norm": 1.3605278730392456,
      "learning_rate": 6.899200000000001e-07,
      "loss": 0.6974,
      "step": 540
    },
    {
      "epoch": 0.0352,
      "grad_norm": 1.4932376146316528,
      "learning_rate": 7.027200000000001e-07,
      "loss": 0.6918,
      "step": 550
    },
    {
      "epoch": 0.03584,
      "grad_norm": 1.2169368267059326,
      "learning_rate": 7.155200000000001e-07,
      "loss": 0.6996,
      "step": 560
    },
    {
      "epoch": 0.03648,
      "grad_norm": 1.5690464973449707,
      "learning_rate": 7.2832e-07,
      "loss": 0.6942,
      "step": 570
    },
    {
      "epoch": 0.03712,
      "grad_norm": 1.541991949081421,
      "learning_rate": 7.4112e-07,
      "loss": 0.6973,
      "step": 580
    },
    {
      "epoch": 0.03776,
      "grad_norm": 1.7749661207199097,
      "learning_rate": 7.5392e-07,
      "loss": 0.6865,
      "step": 590
    },
    {
      "epoch": 0.0384,
      "grad_norm": 1.2169281244277954,
      "learning_rate": 7.667200000000001e-07,
      "loss": 0.6876,
      "step": 600
    }
  ],
  "logging_steps": 10,
  "max_steps": 156250,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 10,
  "save_steps": 200,
  "stateful_callbacks": {
    "EarlyStoppingCallback": {
      "args": {
        "early_stopping_patience": 3,
        "early_stopping_threshold": 0.0
      },
      "attributes": {
        "early_stopping_patience_counter": 0
      }
    },
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 5051732262912000.0,
  "train_batch_size": 32,
  "trial_name": null,
  "trial_params": null
}