{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.4708236471176765,
  "eval_steps": 500,
  "global_step": 8000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.005885295588970956,
      "grad_norm": 29.875,
      "learning_rate": 3.9215686274509805e-05,
      "loss": 1.0686,
      "step": 100
    },
    {
      "epoch": 0.011770591177941912,
      "grad_norm": 11.375,
      "learning_rate": 7.843137254901961e-05,
      "loss": 0.6153,
      "step": 200
    },
    {
      "epoch": 0.01765588676691287,
      "grad_norm": 20.25,
      "learning_rate": 0.00011764705882352942,
      "loss": 0.5969,
      "step": 300
    },
    {
      "epoch": 0.023541182355883823,
      "grad_norm": 4.5,
      "learning_rate": 0.00015686274509803922,
      "loss": 0.5478,
      "step": 400
    },
    {
      "epoch": 0.02942647794485478,
      "grad_norm": 7.40625,
      "learning_rate": 0.000196078431372549,
      "loss": 0.5599,
      "step": 500
    },
    {
      "epoch": 0.03531177353382574,
      "grad_norm": 2.71875,
      "learning_rate": 0.00019998528443307886,
      "loss": 0.565,
      "step": 600
    },
    {
      "epoch": 0.04119706912279669,
      "grad_norm": 30.875,
      "learning_rate": 0.00019993442136695625,
      "loss": 0.5501,
      "step": 700
    },
    {
      "epoch": 0.047082364711767646,
      "grad_norm": 3.328125,
      "learning_rate": 0.00019984724760441856,
      "loss": 0.5355,
      "step": 800
    },
    {
      "epoch": 0.05296766030073861,
      "grad_norm": 6.03125,
      "learning_rate": 0.00019972379481963764,
      "loss": 0.5344,
      "step": 900
    },
    {
      "epoch": 0.05885295588970956,
      "grad_norm": 29.75,
      "learning_rate": 0.00019956410786859524,
      "loss": 0.5016,
      "step": 1000
    },
    {
      "epoch": 0.06473825147868052,
      "grad_norm": 14.0625,
      "learning_rate": 0.00019936824477278514,
      "loss": 0.5091,
      "step": 1100
    },
    {
      "epoch": 0.07062354706765148,
      "grad_norm": 24.75,
      "learning_rate": 0.00019913627669813103,
      "loss": 0.5005,
      "step": 1200
    },
    {
      "epoch": 0.07650884265662243,
      "grad_norm": 3.203125,
      "learning_rate": 0.00019886828792912894,
      "loss": 0.4961,
      "step": 1300
    },
    {
      "epoch": 0.08239413824559338,
      "grad_norm": 1.875,
      "learning_rate": 0.0001985643758382227,
      "loss": 0.4755,
      "step": 1400
    },
    {
      "epoch": 0.08827943383456434,
      "grad_norm": 7.46875,
      "learning_rate": 0.00019822465085042422,
      "loss": 0.4889,
      "step": 1500
    },
    {
      "epoch": 0.09416472942353529,
      "grad_norm": 3.859375,
      "learning_rate": 0.0001978492364031911,
      "loss": 0.5024,
      "step": 1600
    },
    {
      "epoch": 0.10005002501250625,
      "grad_norm": 14.4375,
      "learning_rate": 0.00019743826890157614,
      "loss": 0.4681,
      "step": 1700
    },
    {
      "epoch": 0.10593532060147721,
      "grad_norm": 10.375,
      "learning_rate": 0.0001969918976686652,
      "loss": 0.488,
      "step": 1800
    },
    {
      "epoch": 0.11182061619044817,
      "grad_norm": 9.5625,
      "learning_rate": 0.00019651028489132147,
      "loss": 0.4859,
      "step": 1900
    },
    {
      "epoch": 0.11770591177941912,
      "grad_norm": 15.125,
      "learning_rate": 0.0001959936055612557,
      "loss": 0.5028,
      "step": 2000
    },
    {
      "epoch": 0.12359120736839008,
      "grad_norm": 12.5625,
      "learning_rate": 0.0001954420474114435,
      "loss": 0.4937,
      "step": 2100
    },
    {
      "epoch": 0.12947650295736104,
      "grad_norm": 3.890625,
      "learning_rate": 0.00019485581084791376,
      "loss": 0.4801,
      "step": 2200
    },
    {
      "epoch": 0.13536179854633199,
      "grad_norm": 19.125,
      "learning_rate": 0.0001942351088769319,
      "loss": 0.4853,
      "step": 2300
    },
    {
      "epoch": 0.14124709413530295,
      "grad_norm": 11.8125,
      "learning_rate": 0.0001935801670276052,
      "loss": 0.4739,
      "step": 2400
    },
    {
      "epoch": 0.1471323897242739,
      "grad_norm": 35.5,
      "learning_rate": 0.00019289122326993777,
      "loss": 0.4868,
      "step": 2500
    },
    {
      "epoch": 0.15301768531324486,
      "grad_norm": 20.875,
      "learning_rate": 0.00019216852792836516,
      "loss": 0.4925,
      "step": 2600
    },
    {
      "epoch": 0.1589029809022158,
      "grad_norm": 12.5625,
      "learning_rate": 0.00019141234359080055,
      "loss": 0.4808,
      "step": 2700
    },
    {
      "epoch": 0.16478827649118677,
      "grad_norm": 8.6875,
      "learning_rate": 0.00019062294501322416,
      "loss": 0.4757,
      "step": 2800
    },
    {
      "epoch": 0.17067357208015774,
      "grad_norm": 20.625,
      "learning_rate": 0.0001898006190198525,
      "loss": 0.4805,
      "step": 2900
    },
    {
      "epoch": 0.17655886766912868,
      "grad_norm": 10.25,
      "learning_rate": 0.0001889456643989218,
      "loss": 0.4832,
      "step": 3000
    },
    {
      "epoch": 0.18244416325809965,
      "grad_norm": 20.25,
      "learning_rate": 0.00018805839179412485,
      "loss": 0.4559,
      "step": 3100
    },
    {
      "epoch": 0.18832945884707059,
      "grad_norm": 8.5625,
      "learning_rate": 0.00018713912359174,
      "loss": 0.497,
      "step": 3200
    },
    {
      "epoch": 0.19421475443604155,
      "grad_norm": 6.40625,
      "learning_rate": 0.00018618819380349382,
      "loss": 0.4776,
      "step": 3300
    },
    {
      "epoch": 0.2001000500250125,
      "grad_norm": 12.8125,
      "learning_rate": 0.00018520594794519941,
      "loss": 0.4915,
      "step": 3400
    },
    {
      "epoch": 0.20598534561398346,
      "grad_norm": 1.84375,
      "learning_rate": 0.00018419274291121485,
      "loss": 0.4498,
      "step": 3500
    },
    {
      "epoch": 0.21187064120295443,
      "grad_norm": 3.8125,
      "learning_rate": 0.00018314894684476736,
      "loss": 0.4625,
      "step": 3600
    },
    {
      "epoch": 0.21775593679192537,
      "grad_norm": 19.125,
      "learning_rate": 0.00018207493900419027,
      "loss": 0.4625,
      "step": 3700
    },
    {
      "epoch": 0.22364123238089634,
      "grad_norm": 11.5,
      "learning_rate": 0.00018097110962512128,
      "loss": 0.4655,
      "step": 3800
    },
    {
      "epoch": 0.22952652796986728,
      "grad_norm": 6.3125,
      "learning_rate": 0.00017983785977871209,
      "loss": 0.4488,
      "step": 3900
    },
    {
      "epoch": 0.23541182355883825,
      "grad_norm": 9.875,
      "learning_rate": 0.00017867560122590125,
      "loss": 0.4441,
      "step": 4000
    },
    {
      "epoch": 0.24129711914780919,
      "grad_norm": 12.875,
      "learning_rate": 0.00017748475626780277,
      "loss": 0.4732,
      "step": 4100
    },
    {
      "epoch": 0.24718241473678015,
      "grad_norm": 4.21875,
      "learning_rate": 0.0001762657575922649,
      "loss": 0.4544,
      "step": 4200
    },
    {
      "epoch": 0.2530677103257511,
      "grad_norm": 3.125,
      "learning_rate": 0.0001750190481166552,
      "loss": 0.4779,
      "step": 4300
    },
    {
      "epoch": 0.2589530059147221,
      "grad_norm": 2.1875,
      "learning_rate": 0.00017374508082692848,
      "loss": 0.4661,
      "step": 4400
    },
    {
      "epoch": 0.26483830150369303,
      "grad_norm": 26.25,
      "learning_rate": 0.0001724443186130367,
      "loss": 0.4916,
      "step": 4500
    },
    {
      "epoch": 0.27072359709266397,
      "grad_norm": 8.125,
      "learning_rate": 0.00017111723410073991,
      "loss": 0.449,
      "step": 4600
    },
    {
      "epoch": 0.2766088926816349,
      "grad_norm": 8.625,
      "learning_rate": 0.00016976430947988007,
      "loss": 0.45,
      "step": 4700
    },
    {
      "epoch": 0.2824941882706059,
      "grad_norm": 3.59375,
      "learning_rate": 0.00016838603632917954,
      "loss": 0.4593,
      "step": 4800
    },
    {
      "epoch": 0.28837948385957685,
      "grad_norm": 6.40625,
      "learning_rate": 0.0001669829154376285,
      "loss": 0.4847,
      "step": 4900
    },
    {
      "epoch": 0.2942647794485478,
      "grad_norm": 13.125,
      "learning_rate": 0.00016555545662252536,
      "loss": 0.4576,
      "step": 5000
    },
    {
      "epoch": 0.3001500750375188,
      "grad_norm": 14.3125,
      "learning_rate": 0.00016410417854423735,
      "loss": 0.4457,
      "step": 5100
    },
    {
      "epoch": 0.3060353706264897,
      "grad_norm": 29.0,
      "learning_rate": 0.00016262960851774752,
      "loss": 0.4972,
      "step": 5200
    },
    {
      "epoch": 0.31192066621546066,
      "grad_norm": 20.75,
      "learning_rate": 0.00016113228232105757,
      "loss": 0.4715,
      "step": 5300
    },
    {
      "epoch": 0.3178059618044316,
      "grad_norm": 22.5,
      "learning_rate": 0.0001596127440005152,
      "loss": 0.4696,
      "step": 5400
    },
    {
      "epoch": 0.3236912573934026,
      "grad_norm": 8.1875,
      "learning_rate": 0.00015807154567313775,
      "loss": 0.4629,
      "step": 5500
    },
    {
      "epoch": 0.32957655298237354,
      "grad_norm": 4.375,
      "learning_rate": 0.0001565092473260029,
      "loss": 0.475,
      "step": 5600
    },
    {
      "epoch": 0.3354618485713445,
      "grad_norm": 13.5,
      "learning_rate": 0.00015492641661278005,
      "loss": 0.4511,
      "step": 5700
    },
    {
      "epoch": 0.3413471441603155,
      "grad_norm": 3.5625,
      "learning_rate": 0.0001533236286474762,
      "loss": 0.4743,
      "step": 5800
    },
    {
      "epoch": 0.3472324397492864,
      "grad_norm": 11.8125,
      "learning_rate": 0.0001517014657954708,
      "loss": 0.4418,
      "step": 5900
    },
    {
      "epoch": 0.35311773533825735,
      "grad_norm": 26.125,
      "learning_rate": 0.00015006051746191626,
      "loss": 0.45,
      "step": 6000
    },
    {
      "epoch": 0.3590030309272283,
      "grad_norm": 15.375,
      "learning_rate": 0.00014840137987758028,
      "loss": 0.4463,
      "step": 6100
    },
    {
      "epoch": 0.3648883265161993,
      "grad_norm": 5.90625,
      "learning_rate": 0.00014672465588220837,
      "loss": 0.4559,
      "step": 6200
    },
    {
      "epoch": 0.37077362210517023,
      "grad_norm": 12.9375,
      "learning_rate": 0.0001450309547054846,
      "loss": 0.4398,
      "step": 6300
    },
    {
      "epoch": 0.37665891769414117,
      "grad_norm": 21.875,
      "learning_rate": 0.00014332089174567126,
      "loss": 0.4454,
      "step": 6400
    },
    {
      "epoch": 0.38254421328311217,
      "grad_norm": 16.875,
      "learning_rate": 0.00014159508834600657,
      "loss": 0.4443,
      "step": 6500
    },
    {
      "epoch": 0.3884295088720831,
      "grad_norm": 34.25,
      "learning_rate": 0.00013985417156894267,
      "loss": 0.4762,
      "step": 6600
    },
    {
      "epoch": 0.39431480446105405,
      "grad_norm": 4.5625,
      "learning_rate": 0.0001380987739683055,
      "loss": 0.4795,
      "step": 6700
    },
    {
      "epoch": 0.400200100050025,
      "grad_norm": 15.75,
      "learning_rate": 0.00013632953335945927,
      "loss": 0.4603,
      "step": 6800
    },
    {
      "epoch": 0.406085395638996,
      "grad_norm": 5.40625,
      "learning_rate": 0.00013454709258755942,
      "loss": 0.4674,
      "step": 6900
    },
    {
      "epoch": 0.4119706912279669,
      "grad_norm": 30.125,
      "learning_rate": 0.00013275209929397775,
      "loss": 0.4595,
      "step": 7000
    },
    {
      "epoch": 0.41785598681693786,
      "grad_norm": 16.875,
      "learning_rate": 0.0001309452056809851,
      "loss": 0.4398,
      "step": 7100
    },
    {
      "epoch": 0.42374128240590886,
      "grad_norm": 5.6875,
      "learning_rate": 0.00012912706827477671,
      "loss": 0.4693,
      "step": 7200
    },
    {
      "epoch": 0.4296265779948798,
      "grad_norm": 17.125,
      "learning_rate": 0.00012729834768692667,
      "loss": 0.4564,
      "step": 7300
    },
    {
      "epoch": 0.43551187358385074,
      "grad_norm": 9.75,
      "learning_rate": 0.00012545970837435756,
      "loss": 0.4732,
      "step": 7400
    },
    {
      "epoch": 0.4413971691728217,
      "grad_norm": 6.3125,
      "learning_rate": 0.00012361181839791357,
      "loss": 0.4647,
      "step": 7500
    },
    {
      "epoch": 0.4472824647617927,
      "grad_norm": 19.0,
      "learning_rate": 0.00012175534917962352,
      "loss": 0.4697,
      "step": 7600
    },
    {
      "epoch": 0.4531677603507636,
      "grad_norm": 19.375,
      "learning_rate": 0.00011989097525874294,
      "loss": 0.4814,
      "step": 7700
    },
    {
      "epoch": 0.45905305593973456,
      "grad_norm": 2.015625,
      "learning_rate": 0.00011801937404666336,
      "loss": 0.4688,
      "step": 7800
    },
    {
      "epoch": 0.46493835152870555,
      "grad_norm": 9.625,
      "learning_rate": 0.00011614122558077828,
      "loss": 0.4665,
      "step": 7900
    },
    {
      "epoch": 0.4708236471176765,
      "grad_norm": 21.875,
      "learning_rate": 0.00011425721227739465,
      "loss": 0.472,
      "step": 8000
    }
  ],
  "logging_steps": 100,
  "max_steps": 16991,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 4000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.253476198349144e+18,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}