{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 9.90625,
  "eval_steps": 500,
  "global_step": 951,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.10416666666666667,
      "grad_norm": 2.861802577972412,
      "learning_rate": 4.166666666666667e-05,
      "loss": 0.9682,
      "step": 10
    },
    {
      "epoch": 0.20833333333333334,
      "grad_norm": 3.0431478023529053,
      "learning_rate": 8.333333333333334e-05,
      "loss": 0.3898,
      "step": 20
    },
    {
      "epoch": 0.3125,
      "grad_norm": 1.3093934059143066,
      "learning_rate": 0.000125,
      "loss": 0.2177,
      "step": 30
    },
    {
      "epoch": 0.4166666666666667,
      "grad_norm": 0.8621488809585571,
      "learning_rate": 0.0001666666666666667,
      "loss": 0.1536,
      "step": 40
    },
    {
      "epoch": 0.5208333333333334,
      "grad_norm": 0.9496772289276123,
      "learning_rate": 0.00019999757923579923,
      "loss": 0.1156,
      "step": 50
    },
    {
      "epoch": 0.625,
      "grad_norm": 1.193084716796875,
      "learning_rate": 0.00019991286479434454,
      "loss": 0.1079,
      "step": 60
    },
    {
      "epoch": 0.7291666666666666,
      "grad_norm": 0.9372045397758484,
      "learning_rate": 0.00019970722931933287,
      "loss": 0.0979,
      "step": 70
    },
    {
      "epoch": 0.8333333333333334,
      "grad_norm": 1.1392306089401245,
      "learning_rate": 0.0001993809216841623,
      "loss": 0.0873,
      "step": 80
    },
    {
      "epoch": 0.9375,
      "grad_norm": 0.5865321159362793,
      "learning_rate": 0.00019893433680751103,
      "loss": 0.0788,
      "step": 90
    },
    {
      "epoch": 1.0416666666666667,
      "grad_norm": 0.41245004534721375,
      "learning_rate": 0.00019836801517538125,
      "loss": 0.0613,
      "step": 100
    },
    {
      "epoch": 1.1458333333333333,
      "grad_norm": 0.5878641605377197,
      "learning_rate": 0.00019768264218696772,
      "loss": 0.0642,
      "step": 110
    },
    {
      "epoch": 1.25,
      "grad_norm": 0.4363374710083008,
      "learning_rate": 0.0001968790473251434,
      "loss": 0.0544,
      "step": 120
    },
    {
      "epoch": 1.3541666666666667,
      "grad_norm": 0.4858635663986206,
      "learning_rate": 0.0001959582031525653,
      "loss": 0.0524,
      "step": 130
    },
    {
      "epoch": 1.4583333333333333,
      "grad_norm": 0.6191464066505432,
      "learning_rate": 0.00019492122413461603,
      "loss": 0.0555,
      "step": 140
    },
    {
      "epoch": 1.5625,
      "grad_norm": 0.3185575008392334,
      "learning_rate": 0.00019376936529060554,
      "loss": 0.0494,
      "step": 150
    },
    {
      "epoch": 1.6666666666666665,
      "grad_norm": 0.266720175743103,
      "learning_rate": 0.00019250402067486522,
      "loss": 0.0474,
      "step": 160
    },
    {
      "epoch": 1.7708333333333335,
      "grad_norm": 0.24431076645851135,
      "learning_rate": 0.00019112672168957292,
      "loss": 0.0462,
      "step": 170
    },
    {
      "epoch": 1.875,
      "grad_norm": 0.2719191014766693,
      "learning_rate": 0.0001896391352313506,
      "loss": 0.0395,
      "step": 180
    },
    {
      "epoch": 1.9791666666666665,
      "grad_norm": 0.40286824107170105,
      "learning_rate": 0.00018804306167387796,
      "loss": 0.0449,
      "step": 190
    },
    {
      "epoch": 2.0833333333333335,
      "grad_norm": 0.4218553304672241,
      "learning_rate": 0.00018634043268896323,
      "loss": 0.0361,
      "step": 200
    },
    {
      "epoch": 2.1875,
      "grad_norm": 0.40750882029533386,
      "learning_rate": 0.00018453330890870855,
      "loss": 0.0418,
      "step": 210
    },
    {
      "epoch": 2.2916666666666665,
      "grad_norm": 1.9637465476989746,
      "learning_rate": 0.0001826238774315995,
      "loss": 0.0925,
      "step": 220
    },
    {
      "epoch": 2.3958333333333335,
      "grad_norm": 0.2965734004974365,
      "learning_rate": 0.00018061444917553629,
      "loss": 0.052,
      "step": 230
    },
    {
      "epoch": 2.5,
      "grad_norm": 0.4165923297405243,
      "learning_rate": 0.0001785074560810111,
      "loss": 0.0512,
      "step": 240
    },
    {
      "epoch": 2.6041666666666665,
      "grad_norm": 0.44070684909820557,
      "learning_rate": 0.00017630544816781577,
      "loss": 0.0418,
      "step": 250
    },
    {
      "epoch": 2.7083333333333335,
      "grad_norm": 0.3902498185634613,
      "learning_rate": 0.00017401109044884246,
      "loss": 0.0423,
      "step": 260
    },
    {
      "epoch": 2.8125,
      "grad_norm": 0.458552747964859,
      "learning_rate": 0.0001716271597047119,
      "loss": 0.0398,
      "step": 270
    },
    {
      "epoch": 2.9166666666666665,
      "grad_norm": 0.3544536530971527,
      "learning_rate": 0.00016915654112313345,
      "loss": 0.0376,
      "step": 280
    },
    {
      "epoch": 3.0208333333333335,
      "grad_norm": 0.5818161368370056,
      "learning_rate": 0.00016660222480706355,
      "loss": 0.0447,
      "step": 290
    },
    {
      "epoch": 3.125,
      "grad_norm": 0.35342398285865784,
      "learning_rate": 0.00016396730215588915,
      "loss": 0.0401,
      "step": 300
    },
    {
      "epoch": 3.2291666666666665,
      "grad_norm": 0.28917449712753296,
      "learning_rate": 0.0001612549621240154,
      "loss": 0.0447,
      "step": 310
    },
    {
      "epoch": 3.3333333333333335,
      "grad_norm": 0.34957313537597656,
      "learning_rate": 0.00015846848736138623,
      "loss": 0.034,
      "step": 320
    },
    {
      "epoch": 3.4375,
      "grad_norm": 0.2229030877351761,
      "learning_rate": 0.00015561125024060826,
      "loss": 0.0351,
      "step": 330
    },
    {
      "epoch": 3.5416666666666665,
      "grad_norm": 0.1731082648038864,
      "learning_rate": 0.00015268670877548648,
      "loss": 0.0369,
      "step": 340
    },
    {
      "epoch": 3.6458333333333335,
      "grad_norm": 0.33026209473609924,
      "learning_rate": 0.00014969840243591177,
      "loss": 0.0324,
      "step": 350
    },
    {
      "epoch": 3.75,
      "grad_norm": 0.22994904220104218,
      "learning_rate": 0.0001466499478641644,
      "loss": 0.0377,
      "step": 360
    },
    {
      "epoch": 3.8541666666666665,
      "grad_norm": 0.25043389201164246,
      "learning_rate": 0.00014354503449781912,
      "loss": 0.0334,
      "step": 370
    },
    {
      "epoch": 3.9583333333333335,
      "grad_norm": 0.26702672243118286,
      "learning_rate": 0.00014038742010454814,
      "loss": 0.0311,
      "step": 380
    },
    {
      "epoch": 4.0625,
      "grad_norm": 0.28817203640937805,
      "learning_rate": 0.00013718092623422686,
      "loss": 0.0339,
      "step": 390
    },
    {
      "epoch": 4.166666666666667,
      "grad_norm": 0.32753250002861023,
      "learning_rate": 0.00013392943359384624,
      "loss": 0.0313,
      "step": 400
    },
    {
      "epoch": 4.270833333333333,
      "grad_norm": 0.26729685068130493,
      "learning_rate": 0.00013063687735082933,
      "loss": 0.0354,
      "step": 410
    },
    {
      "epoch": 4.375,
      "grad_norm": 0.25145024061203003,
      "learning_rate": 0.00012730724237043615,
      "loss": 0.0316,
      "step": 420
    },
    {
      "epoch": 4.479166666666667,
      "grad_norm": 0.21019020676612854,
      "learning_rate": 0.00012394455839302113,
      "loss": 0.0341,
      "step": 430
    },
    {
      "epoch": 4.583333333333333,
      "grad_norm": 0.25795239210128784,
      "learning_rate": 0.00012055289515698007,
      "loss": 0.0309,
      "step": 440
    },
    {
      "epoch": 4.6875,
      "grad_norm": 0.34662681818008423,
      "learning_rate": 0.00011713635747328818,
      "loss": 0.0274,
      "step": 450
    },
    {
      "epoch": 4.791666666666667,
      "grad_norm": 0.26628291606903076,
      "learning_rate": 0.00011369908025759167,
      "loss": 0.028,
      "step": 460
    },
    {
      "epoch": 4.895833333333333,
      "grad_norm": 0.38035184144973755,
      "learning_rate": 0.00011024522352586452,
      "loss": 0.0273,
      "step": 470
    },
    {
      "epoch": 5.0,
      "grad_norm": 0.515480637550354,
      "learning_rate": 0.00010677896735968693,
      "loss": 0.028,
      "step": 480
    },
    {
      "epoch": 5.104166666666667,
      "grad_norm": 0.2834020256996155,
      "learning_rate": 0.00010330450684723955,
      "loss": 0.0241,
      "step": 490
    },
    {
      "epoch": 5.208333333333333,
      "grad_norm": 0.2605891823768616,
      "learning_rate": 9.982604700613529e-05,
      "loss": 0.0285,
      "step": 500
    },
    {
      "epoch": 5.3125,
      "grad_norm": 0.20520137250423431,
      "learning_rate": 9.63477976942341e-05,
      "loss": 0.0233,
      "step": 510
    },
    {
      "epoch": 5.416666666666667,
      "grad_norm": 0.34367069602012634,
      "learning_rate": 9.287396851460008e-05,
      "loss": 0.0248,
      "step": 520
    },
    {
      "epoch": 5.520833333333333,
      "grad_norm": 0.17647576332092285,
      "learning_rate": 8.940876372076603e-05,
      "loss": 0.0249,
      "step": 530
    },
    {
      "epoch": 5.625,
      "grad_norm": 0.18092995882034302,
      "learning_rate": 8.595637712847358e-05,
      "loss": 0.0299,
      "step": 540
    },
    {
      "epoch": 5.729166666666667,
      "grad_norm": 0.4016890823841095,
      "learning_rate": 8.252098704004479e-05,
      "loss": 0.0221,
      "step": 550
    },
    {
      "epoch": 5.833333333333333,
      "grad_norm": 0.20710507035255432,
      "learning_rate": 7.910675118752977e-05,
      "loss": 0.0227,
      "step": 560
    },
    {
      "epoch": 5.9375,
      "grad_norm": 0.16734477877616882,
      "learning_rate": 7.57178017007492e-05,
      "loss": 0.0272,
      "step": 570
    },
    {
      "epoch": 6.041666666666667,
      "grad_norm": 0.25709211826324463,
      "learning_rate": 7.235824010632283e-05,
      "loss": 0.0262,
      "step": 580
    },
    {
      "epoch": 6.145833333333333,
      "grad_norm": 0.20443040132522583,
      "learning_rate": 6.903213236373591e-05,
      "loss": 0.0248,
      "step": 590
    },
    {
      "epoch": 6.25,
      "grad_norm": 0.33397573232650757,
      "learning_rate": 6.574350394445074e-05,
      "loss": 0.0232,
      "step": 600
    },
    {
      "epoch": 6.354166666666667,
      "grad_norm": 0.29977497458457947,
      "learning_rate": 6.249633496002016e-05,
      "loss": 0.0259,
      "step": 610
    },
    {
      "epoch": 6.458333333333333,
      "grad_norm": 0.2301492542028427,
      "learning_rate": 5.929455534509818e-05,
      "loss": 0.0225,
      "step": 620
    },
    {
      "epoch": 6.5625,
      "grad_norm": 0.17564110457897186,
      "learning_rate": 5.614204010117785e-05,
      "loss": 0.0242,
      "step": 630
    },
    {
      "epoch": 6.666666666666667,
      "grad_norm": 0.22045820951461792,
      "learning_rate": 5.304260460681309e-05,
      "loss": 0.0241,
      "step": 640
    },
    {
      "epoch": 6.770833333333333,
      "grad_norm": 0.19701404869556427,
      "learning_rate": 5.000000000000002e-05,
      "loss": 0.0227,
      "step": 650
    },
    {
      "epoch": 6.875,
      "grad_norm": 0.16471460461616516,
      "learning_rate": 4.7017908638305995e-05,
      "loss": 0.0232,
      "step": 660
    },
    {
      "epoch": 6.979166666666667,
      "grad_norm": 0.19613026082515717,
      "learning_rate": 4.4099939642241795e-05,
      "loss": 0.0231,
      "step": 670
    },
    {
      "epoch": 7.083333333333333,
      "grad_norm": 0.2281058430671692,
      "learning_rate": 4.124962452726969e-05,
      "loss": 0.0194,
      "step": 680
    },
    {
      "epoch": 7.1875,
      "grad_norm": 0.33613333106040955,
      "learning_rate": 3.84704129297339e-05,
      "loss": 0.0189,
      "step": 690
    },
    {
      "epoch": 7.291666666666667,
      "grad_norm": 0.11981873214244843,
      "learning_rate": 3.576566843188729e-05,
      "loss": 0.0193,
      "step": 700
    },
    {
      "epoch": 7.395833333333333,
      "grad_norm": 0.14438898861408234,
      "learning_rate": 3.313866449106555e-05,
      "loss": 0.0195,
      "step": 710
    },
    {
      "epoch": 7.5,
      "grad_norm": 0.157650426030159,
      "learning_rate": 3.059258047793661e-05,
      "loss": 0.0213,
      "step": 720
    },
    {
      "epoch": 7.604166666666667,
      "grad_norm": 0.19574569165706635,
      "learning_rate": 2.8130497828620128e-05,
      "loss": 0.0213,
      "step": 730
    },
    {
      "epoch": 7.708333333333333,
      "grad_norm": 0.18361669778823853,
      "learning_rate": 2.5755396315333324e-05,
      "loss": 0.0195,
      "step": 740
    },
    {
      "epoch": 7.8125,
      "grad_norm": 0.17926767468452454,
      "learning_rate": 2.3470150440077266e-05,
      "loss": 0.0214,
      "step": 750
    },
    {
      "epoch": 7.916666666666667,
      "grad_norm": 0.14856334030628204,
      "learning_rate": 2.1277525955728138e-05,
      "loss": 0.0214,
      "step": 760
    },
    {
      "epoch": 8.020833333333334,
      "grad_norm": 0.2279294729232788,
      "learning_rate": 1.9180176518743476e-05,
      "loss": 0.0196,
      "step": 770
    },
    {
      "epoch": 8.125,
      "grad_norm": 0.17617039382457733,
      "learning_rate": 1.7180640477534847e-05,
      "loss": 0.0216,
      "step": 780
    },
    {
      "epoch": 8.229166666666666,
      "grad_norm": 0.09398578852415085,
      "learning_rate": 1.5281337800393968e-05,
      "loss": 0.0186,
      "step": 790
    },
    {
      "epoch": 8.333333333333334,
      "grad_norm": 0.14784985780715942,
      "learning_rate": 1.3484567146690009e-05,
      "loss": 0.018,
      "step": 800
    },
    {
      "epoch": 8.4375,
      "grad_norm": 0.18601654469966888,
      "learning_rate": 1.1792503084882789e-05,
      "loss": 0.0187,
      "step": 810
    },
    {
      "epoch": 8.541666666666666,
      "grad_norm": 0.22347238659858704,
      "learning_rate": 1.0207193460718856e-05,
      "loss": 0.0171,
      "step": 820
    },
    {
      "epoch": 8.645833333333334,
      "grad_norm": 0.16696269810199738,
      "learning_rate": 8.730556918795785e-06,
      "loss": 0.0171,
      "step": 830
    },
    {
      "epoch": 8.75,
      "grad_norm": 0.2363879382610321,
      "learning_rate": 7.364380580493813e-06,
      "loss": 0.0175,
      "step": 840
    },
    {
      "epoch": 8.854166666666666,
      "grad_norm": 0.14943011105060577,
      "learning_rate": 6.1103178810856364e-06,
      "loss": 0.0208,
      "step": 850
    },
    {
      "epoch": 8.958333333333334,
      "grad_norm": 0.18391437828540802,
      "learning_rate": 4.969886568641757e-06,
      "loss": 0.0172,
      "step": 860
    },
    {
      "epoch": 9.0625,
      "grad_norm": 0.17471212148666382,
      "learning_rate": 3.944466867153218e-06,
      "loss": 0.0199,
      "step": 870
    },
    {
      "epoch": 9.166666666666666,
      "grad_norm": 0.2174932211637497,
      "learning_rate": 3.0352998060949155e-06,
      "loss": 0.0173,
      "step": 880
    },
    {
      "epoch": 9.270833333333334,
      "grad_norm": 0.15788139402866364,
      "learning_rate": 2.2434857184512435e-06,
      "loss": 0.0188,
      "step": 890
    },
    {
      "epoch": 9.375,
      "grad_norm": 0.12695789337158203,
      "learning_rate": 1.5699829090217278e-06,
      "loss": 0.0183,
      "step": 900
    },
    {
      "epoch": 9.479166666666666,
      "grad_norm": 0.21467889845371246,
      "learning_rate": 1.0156064946182376e-06,
      "loss": 0.0213,
      "step": 910
    },
    {
      "epoch": 9.583333333333334,
      "grad_norm": 0.11011941730976105,
      "learning_rate": 5.810274175578445e-07,
      "loss": 0.0165,
      "step": 920
    },
    {
      "epoch": 9.6875,
      "grad_norm": 0.1231321394443512,
      "learning_rate": 2.667716336448356e-07,
      "loss": 0.0153,
      "step": 930
    },
    {
      "epoch": 9.791666666666666,
      "grad_norm": 0.23924382030963898,
      "learning_rate": 7.321947562484166e-08,
      "loss": 0.0176,
      "step": 940
    },
    {
      "epoch": 9.895833333333334,
      "grad_norm": 0.1052466407418251,
      "learning_rate": 6.051928814865271e-10,
      "loss": 0.0171,
      "step": 950
    },
    {
      "epoch": 9.90625,
      "step": 951,
      "total_flos": 1.2838499503164576e+17,
      "train_loss": 0.05094248948170246,
      "train_runtime": 1084.1029,
      "train_samples_per_second": 56.142,
      "train_steps_per_second": 0.877
    }
  ],
  "logging_steps": 10,
  "max_steps": 951,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 10,
  "save_steps": 10000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.2838499503164576e+17,
  "train_batch_size": 64,
  "trial_name": null,
  "trial_params": null
}