{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 10.0,
  "eval_steps": 500,
  "global_step": 590,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.08547008547008547,
      "grad_norm": 4.597737789154053,
      "learning_rate": 8e-05,
      "loss": 3.144,
      "step": 5
    },
    {
      "epoch": 0.17094017094017094,
      "grad_norm": 3.6257293224334717,
      "learning_rate": 0.00018,
      "loss": 1.023,
      "step": 10
    },
    {
      "epoch": 0.2564102564102564,
      "grad_norm": 0.9387032389640808,
      "learning_rate": 0.00019862068965517243,
      "loss": 0.3362,
      "step": 15
    },
    {
      "epoch": 0.3418803418803419,
      "grad_norm": 0.41102728247642517,
      "learning_rate": 0.00019689655172413795,
      "loss": 0.1727,
      "step": 20
    },
    {
      "epoch": 0.42735042735042733,
      "grad_norm": 0.504966676235199,
      "learning_rate": 0.00019517241379310345,
      "loss": 0.168,
      "step": 25
    },
    {
      "epoch": 0.5128205128205128,
      "grad_norm": 0.43475794792175293,
      "learning_rate": 0.00019344827586206898,
      "loss": 0.1699,
      "step": 30
    },
    {
      "epoch": 0.5982905982905983,
      "grad_norm": 0.5228849649429321,
      "learning_rate": 0.0001917241379310345,
      "loss": 0.1454,
      "step": 35
    },
    {
      "epoch": 0.6837606837606838,
      "grad_norm": 0.4822940230369568,
      "learning_rate": 0.00019,
      "loss": 0.206,
      "step": 40
    },
    {
      "epoch": 0.7692307692307693,
      "grad_norm": 0.4049399197101593,
      "learning_rate": 0.00018827586206896554,
      "loss": 0.132,
      "step": 45
    },
    {
      "epoch": 0.8547008547008547,
      "grad_norm": 0.4594310224056244,
      "learning_rate": 0.00018655172413793104,
      "loss": 0.1541,
      "step": 50
    },
    {
      "epoch": 0.9401709401709402,
      "grad_norm": 0.3666519820690155,
      "learning_rate": 0.00018482758620689654,
      "loss": 0.1144,
      "step": 55
    },
    {
      "epoch": 1.017094017094017,
      "grad_norm": 0.16928212344646454,
      "learning_rate": 0.00018310344827586207,
      "loss": 0.1208,
      "step": 60
    },
    {
      "epoch": 1.1025641025641026,
      "grad_norm": 0.09923699498176575,
      "learning_rate": 0.0001813793103448276,
      "loss": 0.0761,
      "step": 65
    },
    {
      "epoch": 1.188034188034188,
      "grad_norm": 0.23626509308815002,
      "learning_rate": 0.0001796551724137931,
      "loss": 0.0786,
      "step": 70
    },
    {
      "epoch": 1.2735042735042734,
      "grad_norm": 0.45735999941825867,
      "learning_rate": 0.00017793103448275862,
      "loss": 0.13,
      "step": 75
    },
    {
      "epoch": 1.358974358974359,
      "grad_norm": 0.24871651828289032,
      "learning_rate": 0.00017620689655172415,
      "loss": 0.0814,
      "step": 80
    },
    {
      "epoch": 1.4444444444444444,
      "grad_norm": 0.21524538099765778,
      "learning_rate": 0.00017448275862068965,
      "loss": 0.0737,
      "step": 85
    },
    {
      "epoch": 1.5299145299145298,
      "grad_norm": 0.4590378701686859,
      "learning_rate": 0.00017275862068965518,
      "loss": 0.0955,
      "step": 90
    },
    {
      "epoch": 1.6153846153846154,
      "grad_norm": 0.7036776542663574,
      "learning_rate": 0.0001710344827586207,
      "loss": 0.0671,
      "step": 95
    },
    {
      "epoch": 1.7008547008547008,
      "grad_norm": 0.26162663102149963,
      "learning_rate": 0.0001693103448275862,
      "loss": 0.0828,
      "step": 100
    },
    {
      "epoch": 1.7863247863247862,
      "grad_norm": 0.4105569124221802,
      "learning_rate": 0.00016758620689655173,
      "loss": 0.0768,
      "step": 105
    },
    {
      "epoch": 1.8717948717948718,
      "grad_norm": 0.3037894666194916,
      "learning_rate": 0.00016586206896551726,
      "loss": 0.1149,
      "step": 110
    },
    {
      "epoch": 1.9572649572649574,
      "grad_norm": 0.19420042634010315,
      "learning_rate": 0.00016413793103448276,
      "loss": 0.0635,
      "step": 115
    },
    {
      "epoch": 2.034188034188034,
      "grad_norm": 0.13855452835559845,
      "learning_rate": 0.0001624137931034483,
      "loss": 0.0594,
      "step": 120
    },
    {
      "epoch": 2.1196581196581197,
      "grad_norm": 0.17749273777008057,
      "learning_rate": 0.00016068965517241382,
      "loss": 0.0725,
      "step": 125
    },
    {
      "epoch": 2.2051282051282053,
      "grad_norm": 0.13107630610466003,
      "learning_rate": 0.00015896551724137932,
      "loss": 0.0619,
      "step": 130
    },
    {
      "epoch": 2.2905982905982905,
      "grad_norm": 0.11133825778961182,
      "learning_rate": 0.00015724137931034485,
      "loss": 0.0624,
      "step": 135
    },
    {
      "epoch": 2.376068376068376,
      "grad_norm": 0.187343031167984,
      "learning_rate": 0.00015551724137931037,
      "loss": 0.0581,
      "step": 140
    },
    {
      "epoch": 2.4615384615384617,
      "grad_norm": 0.27685755491256714,
      "learning_rate": 0.00015379310344827587,
      "loss": 0.0613,
      "step": 145
    },
    {
      "epoch": 2.547008547008547,
      "grad_norm": 0.4320373833179474,
      "learning_rate": 0.0001520689655172414,
      "loss": 0.0735,
      "step": 150
    },
    {
      "epoch": 2.6324786324786325,
      "grad_norm": 0.13862545788288116,
      "learning_rate": 0.0001503448275862069,
      "loss": 0.0582,
      "step": 155
    },
    {
      "epoch": 2.717948717948718,
      "grad_norm": 0.7963452339172363,
      "learning_rate": 0.00014862068965517243,
      "loss": 0.0651,
      "step": 160
    },
    {
      "epoch": 2.8034188034188032,
      "grad_norm": 0.14564156532287598,
      "learning_rate": 0.00014689655172413793,
      "loss": 0.0559,
      "step": 165
    },
    {
      "epoch": 2.888888888888889,
      "grad_norm": 0.15069833397865295,
      "learning_rate": 0.00014517241379310346,
      "loss": 0.0529,
      "step": 170
    },
    {
      "epoch": 2.9743589743589745,
      "grad_norm": 0.3557753562927246,
      "learning_rate": 0.00014344827586206896,
      "loss": 0.0773,
      "step": 175
    },
    {
      "epoch": 3.051282051282051,
      "grad_norm": 0.08716096729040146,
      "learning_rate": 0.0001417241379310345,
      "loss": 0.0513,
      "step": 180
    },
    {
      "epoch": 3.1367521367521367,
      "grad_norm": 0.15282496809959412,
      "learning_rate": 0.00014,
      "loss": 0.0621,
      "step": 185
    },
    {
      "epoch": 3.2222222222222223,
      "grad_norm": 0.09816001355648041,
      "learning_rate": 0.00013827586206896552,
      "loss": 0.0648,
      "step": 190
    },
    {
      "epoch": 3.3076923076923075,
      "grad_norm": 0.13748367130756378,
      "learning_rate": 0.00013655172413793104,
      "loss": 0.0485,
      "step": 195
    },
    {
      "epoch": 3.393162393162393,
      "grad_norm": 0.10656469315290451,
      "learning_rate": 0.00013482758620689654,
      "loss": 0.0531,
      "step": 200
    },
    {
      "epoch": 3.4786324786324787,
      "grad_norm": 0.1901499480009079,
      "learning_rate": 0.00013310344827586207,
      "loss": 0.0612,
      "step": 205
    },
    {
      "epoch": 3.564102564102564,
      "grad_norm": 0.16148889064788818,
      "learning_rate": 0.0001313793103448276,
      "loss": 0.0546,
      "step": 210
    },
    {
      "epoch": 3.6495726495726495,
      "grad_norm": 0.19384047389030457,
      "learning_rate": 0.0001296551724137931,
      "loss": 0.0589,
      "step": 215
    },
    {
      "epoch": 3.735042735042735,
      "grad_norm": 0.08794084936380386,
      "learning_rate": 0.00012793103448275863,
      "loss": 0.0573,
      "step": 220
    },
    {
      "epoch": 3.8205128205128203,
      "grad_norm": 0.10576070100069046,
      "learning_rate": 0.00012620689655172415,
      "loss": 0.0471,
      "step": 225
    },
    {
      "epoch": 3.905982905982906,
      "grad_norm": 0.08111118525266647,
      "learning_rate": 0.00012448275862068966,
      "loss": 0.0572,
      "step": 230
    },
    {
      "epoch": 3.9914529914529915,
      "grad_norm": 0.4230298101902008,
      "learning_rate": 0.00012275862068965518,
      "loss": 0.0617,
      "step": 235
    },
    {
      "epoch": 4.068376068376068,
      "grad_norm": 0.08736063539981842,
      "learning_rate": 0.00012103448275862071,
      "loss": 0.0493,
      "step": 240
    },
    {
      "epoch": 4.153846153846154,
      "grad_norm": 0.06979858875274658,
      "learning_rate": 0.00011931034482758621,
      "loss": 0.0469,
      "step": 245
    },
    {
      "epoch": 4.239316239316239,
      "grad_norm": 0.10242439806461334,
      "learning_rate": 0.00011758620689655173,
      "loss": 0.0508,
      "step": 250
    },
    {
      "epoch": 4.3247863247863245,
      "grad_norm": 0.11685860902070999,
      "learning_rate": 0.00011586206896551725,
      "loss": 0.0522,
      "step": 255
    },
    {
      "epoch": 4.410256410256411,
      "grad_norm": 0.1084512323141098,
      "learning_rate": 0.00011413793103448275,
      "loss": 0.0519,
      "step": 260
    },
    {
      "epoch": 4.495726495726496,
      "grad_norm": 0.09368503093719482,
      "learning_rate": 0.00011241379310344828,
      "loss": 0.0494,
      "step": 265
    },
    {
      "epoch": 4.581196581196581,
      "grad_norm": 0.1777074784040451,
      "learning_rate": 0.00011068965517241381,
      "loss": 0.0515,
      "step": 270
    },
    {
      "epoch": 4.666666666666667,
      "grad_norm": 0.056768111884593964,
      "learning_rate": 0.00010896551724137931,
      "loss": 0.044,
      "step": 275
    },
    {
      "epoch": 4.752136752136752,
      "grad_norm": 0.08062291890382767,
      "learning_rate": 0.00010724137931034484,
      "loss": 0.0476,
      "step": 280
    },
    {
      "epoch": 4.837606837606837,
      "grad_norm": 0.09975454211235046,
      "learning_rate": 0.00010551724137931037,
      "loss": 0.0522,
      "step": 285
    },
    {
      "epoch": 4.923076923076923,
      "grad_norm": 0.14652380347251892,
      "learning_rate": 0.00010379310344827587,
      "loss": 0.0498,
      "step": 290
    },
    {
      "epoch": 5.0,
      "grad_norm": 0.12409216165542603,
      "learning_rate": 0.0001020689655172414,
      "loss": 0.0474,
      "step": 295
    },
    {
      "epoch": 5.085470085470085,
      "grad_norm": 0.09494274109601974,
      "learning_rate": 0.0001003448275862069,
      "loss": 0.0462,
      "step": 300
    },
    {
      "epoch": 5.170940170940171,
      "grad_norm": 0.1240062415599823,
      "learning_rate": 9.862068965517242e-05,
      "loss": 0.0425,
      "step": 305
    },
    {
      "epoch": 5.256410256410256,
      "grad_norm": 0.1713438332080841,
      "learning_rate": 9.689655172413794e-05,
      "loss": 0.0431,
      "step": 310
    },
    {
      "epoch": 5.3418803418803416,
      "grad_norm": 0.1990644931793213,
      "learning_rate": 9.517241379310345e-05,
      "loss": 0.048,
      "step": 315
    },
    {
      "epoch": 5.427350427350428,
      "grad_norm": 0.09711036831140518,
      "learning_rate": 9.344827586206896e-05,
      "loss": 0.0476,
      "step": 320
    },
    {
      "epoch": 5.512820512820513,
      "grad_norm": 0.11504214257001877,
      "learning_rate": 9.172413793103448e-05,
      "loss": 0.0494,
      "step": 325
    },
    {
      "epoch": 5.598290598290598,
      "grad_norm": 0.08380427211523056,
      "learning_rate": 9e-05,
      "loss": 0.047,
      "step": 330
    },
    {
      "epoch": 5.683760683760684,
      "grad_norm": 0.08641541749238968,
      "learning_rate": 8.827586206896552e-05,
      "loss": 0.0457,
      "step": 335
    },
    {
      "epoch": 5.769230769230769,
      "grad_norm": 0.0935196503996849,
      "learning_rate": 8.655172413793103e-05,
      "loss": 0.0489,
      "step": 340
    },
    {
      "epoch": 5.854700854700854,
      "grad_norm": 0.11386577785015106,
      "learning_rate": 8.482758620689656e-05,
      "loss": 0.0479,
      "step": 345
    },
    {
      "epoch": 5.94017094017094,
      "grad_norm": 0.08249244838953018,
      "learning_rate": 8.310344827586208e-05,
      "loss": 0.0469,
      "step": 350
    },
    {
      "epoch": 6.017094017094017,
      "grad_norm": 0.09115161001682281,
      "learning_rate": 8.137931034482759e-05,
      "loss": 0.0455,
      "step": 355
    },
    {
      "epoch": 6.102564102564102,
      "grad_norm": 0.06610054522752762,
      "learning_rate": 7.965517241379312e-05,
      "loss": 0.0432,
      "step": 360
    },
    {
      "epoch": 6.188034188034188,
      "grad_norm": 0.09798604249954224,
      "learning_rate": 7.793103448275862e-05,
      "loss": 0.0442,
      "step": 365
    },
    {
      "epoch": 6.273504273504273,
      "grad_norm": 0.12107487767934799,
      "learning_rate": 7.620689655172413e-05,
      "loss": 0.0418,
      "step": 370
    },
    {
      "epoch": 6.358974358974359,
      "grad_norm": 0.10651250928640366,
      "learning_rate": 7.448275862068966e-05,
      "loss": 0.0437,
      "step": 375
    },
    {
      "epoch": 6.444444444444445,
      "grad_norm": 0.09335967153310776,
      "learning_rate": 7.275862068965517e-05,
      "loss": 0.044,
      "step": 380
    },
    {
      "epoch": 6.52991452991453,
      "grad_norm": 0.10894130915403366,
      "learning_rate": 7.103448275862069e-05,
      "loss": 0.0493,
      "step": 385
    },
    {
      "epoch": 6.615384615384615,
      "grad_norm": 0.09522519260644913,
      "learning_rate": 6.931034482758622e-05,
      "loss": 0.0463,
      "step": 390
    },
    {
      "epoch": 6.700854700854701,
      "grad_norm": 0.09910976886749268,
      "learning_rate": 6.758620689655173e-05,
      "loss": 0.0427,
      "step": 395
    },
    {
      "epoch": 6.786324786324786,
      "grad_norm": 0.11286190897226334,
      "learning_rate": 6.586206896551724e-05,
      "loss": 0.0444,
      "step": 400
    },
    {
      "epoch": 6.871794871794872,
      "grad_norm": 0.07890793681144714,
      "learning_rate": 6.413793103448276e-05,
      "loss": 0.0407,
      "step": 405
    },
    {
      "epoch": 6.957264957264957,
      "grad_norm": 0.08769431710243225,
      "learning_rate": 6.241379310344829e-05,
      "loss": 0.0479,
      "step": 410
    },
    {
      "epoch": 7.034188034188034,
      "grad_norm": 0.06925784051418304,
      "learning_rate": 6.068965517241379e-05,
      "loss": 0.0439,
      "step": 415
    },
    {
      "epoch": 7.119658119658119,
      "grad_norm": 0.08389502763748169,
      "learning_rate": 5.896551724137931e-05,
      "loss": 0.0437,
      "step": 420
    },
    {
      "epoch": 7.205128205128205,
      "grad_norm": 0.10391002893447876,
      "learning_rate": 5.7241379310344835e-05,
      "loss": 0.042,
      "step": 425
    },
    {
      "epoch": 7.2905982905982905,
      "grad_norm": 0.09842480719089508,
      "learning_rate": 5.551724137931035e-05,
      "loss": 0.0407,
      "step": 430
    },
    {
      "epoch": 7.3760683760683765,
      "grad_norm": 0.09367308020591736,
      "learning_rate": 5.379310344827586e-05,
      "loss": 0.0422,
      "step": 435
    },
    {
      "epoch": 7.461538461538462,
      "grad_norm": 0.11631827801465988,
      "learning_rate": 5.2068965517241384e-05,
      "loss": 0.0453,
      "step": 440
    },
    {
      "epoch": 7.547008547008547,
      "grad_norm": 0.13546331226825714,
      "learning_rate": 5.03448275862069e-05,
      "loss": 0.0405,
      "step": 445
    },
    {
      "epoch": 7.632478632478632,
      "grad_norm": 0.1015164852142334,
      "learning_rate": 4.862068965517241e-05,
      "loss": 0.0433,
      "step": 450
    },
    {
      "epoch": 7.717948717948718,
      "grad_norm": 0.12304691225290298,
      "learning_rate": 4.689655172413793e-05,
      "loss": 0.0439,
      "step": 455
    },
    {
      "epoch": 7.803418803418803,
      "grad_norm": 0.11133451014757156,
      "learning_rate": 4.5172413793103454e-05,
      "loss": 0.0404,
      "step": 460
    },
    {
      "epoch": 7.888888888888889,
      "grad_norm": 0.11199292540550232,
      "learning_rate": 4.344827586206897e-05,
      "loss": 0.0401,
      "step": 465
    },
    {
      "epoch": 7.9743589743589745,
      "grad_norm": 0.10854869335889816,
      "learning_rate": 4.172413793103448e-05,
      "loss": 0.047,
      "step": 470
    },
    {
      "epoch": 8.051282051282051,
      "grad_norm": 0.08034314215183258,
      "learning_rate": 4e-05,
      "loss": 0.0372,
      "step": 475
    },
    {
      "epoch": 8.136752136752136,
      "grad_norm": 0.07888869941234589,
      "learning_rate": 3.827586206896552e-05,
      "loss": 0.0374,
      "step": 480
    },
    {
      "epoch": 8.222222222222221,
      "grad_norm": 0.08299173414707184,
      "learning_rate": 3.655172413793104e-05,
      "loss": 0.0415,
      "step": 485
    },
    {
      "epoch": 8.307692307692308,
      "grad_norm": 0.10082942992448807,
      "learning_rate": 3.482758620689655e-05,
      "loss": 0.0431,
      "step": 490
    },
    {
      "epoch": 8.393162393162394,
      "grad_norm": 0.13129588961601257,
      "learning_rate": 3.310344827586207e-05,
      "loss": 0.0381,
      "step": 495
    },
    {
      "epoch": 8.478632478632479,
      "grad_norm": 0.0956198126077652,
      "learning_rate": 3.137931034482759e-05,
      "loss": 0.0391,
      "step": 500
    },
    {
      "epoch": 8.564102564102564,
      "grad_norm": 0.10935048758983612,
      "learning_rate": 2.96551724137931e-05,
      "loss": 0.0415,
      "step": 505
    },
    {
      "epoch": 8.649572649572649,
      "grad_norm": 0.09700857102870941,
      "learning_rate": 2.7931034482758622e-05,
      "loss": 0.042,
      "step": 510
    },
    {
      "epoch": 8.735042735042736,
      "grad_norm": 0.09681924432516098,
      "learning_rate": 2.620689655172414e-05,
      "loss": 0.041,
      "step": 515
    },
    {
      "epoch": 8.820512820512821,
      "grad_norm": 0.10170122236013412,
      "learning_rate": 2.4482758620689654e-05,
      "loss": 0.0404,
      "step": 520
    },
    {
      "epoch": 8.905982905982906,
      "grad_norm": 0.10559462755918503,
      "learning_rate": 2.2758620689655175e-05,
      "loss": 0.0395,
      "step": 525
    },
    {
      "epoch": 8.991452991452991,
      "grad_norm": 0.11863423138856888,
      "learning_rate": 2.1034482758620692e-05,
      "loss": 0.0433,
      "step": 530
    },
    {
      "epoch": 9.068376068376068,
      "grad_norm": 0.0633588433265686,
      "learning_rate": 1.9310344827586207e-05,
      "loss": 0.0383,
      "step": 535
    },
    {
      "epoch": 9.153846153846153,
      "grad_norm": 0.08409127593040466,
      "learning_rate": 1.7586206896551724e-05,
      "loss": 0.038,
      "step": 540
    },
    {
      "epoch": 9.239316239316238,
      "grad_norm": 0.12133090943098068,
      "learning_rate": 1.586206896551724e-05,
      "loss": 0.0366,
      "step": 545
    },
    {
      "epoch": 9.324786324786325,
      "grad_norm": 0.09883731603622437,
      "learning_rate": 1.4137931034482759e-05,
      "loss": 0.0386,
      "step": 550
    },
    {
      "epoch": 9.41025641025641,
      "grad_norm": 0.20076970756053925,
      "learning_rate": 1.2413793103448277e-05,
      "loss": 0.0375,
      "step": 555
    },
    {
      "epoch": 9.495726495726496,
      "grad_norm": 0.103940449655056,
      "learning_rate": 1.0689655172413794e-05,
      "loss": 0.0394,
      "step": 560
    },
    {
      "epoch": 9.581196581196581,
      "grad_norm": 0.09235844761133194,
      "learning_rate": 8.96551724137931e-06,
      "loss": 0.0405,
      "step": 565
    },
    {
      "epoch": 9.666666666666666,
      "grad_norm": 0.07304095476865768,
      "learning_rate": 7.241379310344828e-06,
      "loss": 0.0352,
      "step": 570
    },
    {
      "epoch": 9.752136752136753,
      "grad_norm": 0.12776847183704376,
      "learning_rate": 5.517241379310345e-06,
      "loss": 0.04,
      "step": 575
    },
    {
      "epoch": 9.837606837606838,
      "grad_norm": 0.11009430885314941,
      "learning_rate": 3.793103448275862e-06,
      "loss": 0.0374,
      "step": 580
    },
    {
      "epoch": 9.923076923076923,
      "grad_norm": 0.13841569423675537,
      "learning_rate": 2.0689655172413796e-06,
      "loss": 0.0401,
      "step": 585
    },
    {
      "epoch": 10.0,
      "grad_norm": 0.1534666121006012,
      "learning_rate": 3.4482758620689656e-07,
      "loss": 0.0366,
      "step": 590
    },
    {
      "epoch": 10.0,
      "step": 590,
      "total_flos": 9496524054435840.0,
      "train_loss": 0.09625538042036154,
      "train_runtime": 681.7113,
      "train_samples_per_second": 6.85,
      "train_steps_per_second": 0.865
    }
  ],
  "logging_steps": 5,
  "max_steps": 590,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 10,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 9496524054435840.0,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}