{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 9.89875173370319,
  "eval_steps": 500,
  "global_step": 900,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.11095700416088766,
      "grad_norm": 0.31662145256996155,
      "learning_rate": 7.407407407407407e-05,
      "loss": 0.5604,
      "step": 10
    },
    {
      "epoch": 0.22191400832177532,
      "grad_norm": 0.38665255904197693,
      "learning_rate": 0.00014814814814814815,
      "loss": 0.3448,
      "step": 20
    },
    {
      "epoch": 0.332871012482663,
      "grad_norm": 0.38282278180122375,
      "learning_rate": 0.00019999417253661235,
      "loss": 0.1345,
      "step": 30
    },
    {
      "epoch": 0.44382801664355065,
      "grad_norm": 0.33959391713142395,
      "learning_rate": 0.000199890592080658,
      "loss": 0.1206,
      "step": 40
    },
    {
      "epoch": 0.5547850208044383,
      "grad_norm": 0.2943621873855591,
      "learning_rate": 0.00019965766682369186,
      "loss": 0.1234,
      "step": 50
    },
    {
      "epoch": 0.665742024965326,
      "grad_norm": 0.25359126925468445,
      "learning_rate": 0.00019929569837240564,
      "loss": 0.1039,
      "step": 60
    },
    {
      "epoch": 0.7766990291262136,
      "grad_norm": 0.23930878937244415,
      "learning_rate": 0.0001988051554269675,
      "loss": 0.102,
      "step": 70
    },
    {
      "epoch": 0.8876560332871013,
      "grad_norm": 0.2013150006532669,
      "learning_rate": 0.00019818667317411865,
      "loss": 0.0974,
      "step": 80
    },
    {
      "epoch": 0.9986130374479889,
      "grad_norm": 0.25096118450164795,
      "learning_rate": 0.00019744105246469263,
      "loss": 0.099,
      "step": 90
    },
    {
      "epoch": 1.0998613037447988,
      "grad_norm": 0.25178226828575134,
      "learning_rate": 0.0001965692587766216,
      "loss": 0.0714,
      "step": 100
    },
    {
      "epoch": 1.2108183079056865,
      "grad_norm": 0.2704208195209503,
      "learning_rate": 0.00019557242096477327,
      "loss": 0.0771,
      "step": 110
    },
    {
      "epoch": 1.3217753120665743,
      "grad_norm": 0.22107760608196259,
      "learning_rate": 0.00019445182979923654,
      "loss": 0.0703,
      "step": 120
    },
    {
      "epoch": 1.4327323162274619,
      "grad_norm": 0.26953792572021484,
      "learning_rate": 0.00019320893629394873,
      "loss": 0.0753,
      "step": 130
    },
    {
      "epoch": 1.5436893203883495,
      "grad_norm": 0.2142401486635208,
      "learning_rate": 0.00019184534982782904,
      "loss": 0.0724,
      "step": 140
    },
    {
      "epoch": 1.6546463245492373,
      "grad_norm": 0.25699618458747864,
      "learning_rate": 0.00019036283606085053,
      "loss": 0.0648,
      "step": 150
    },
    {
      "epoch": 1.765603328710125,
      "grad_norm": 0.2224379926919937,
      "learning_rate": 0.00018876331464774945,
      "loss": 0.0706,
      "step": 160
    },
    {
      "epoch": 1.8765603328710125,
      "grad_norm": 0.23435620963573456,
      "learning_rate": 0.0001870488567523318,
      "loss": 0.0695,
      "step": 170
    },
    {
      "epoch": 1.9875173370319001,
      "grad_norm": 0.18676415085792542,
      "learning_rate": 0.00018522168236559695,
      "loss": 0.0615,
      "step": 180
    },
    {
      "epoch": 2.08876560332871,
      "grad_norm": 0.24162153899669647,
      "learning_rate": 0.00018328415743114912,
      "loss": 0.0445,
      "step": 190
    },
    {
      "epoch": 2.1997226074895977,
      "grad_norm": 0.3869277536869049,
      "learning_rate": 0.00018123879078162097,
      "loss": 0.0502,
      "step": 200
    },
    {
      "epoch": 2.3106796116504853,
      "grad_norm": 0.3037394881248474,
      "learning_rate": 0.00017908823089007457,
      "loss": 0.0482,
      "step": 210
    },
    {
      "epoch": 2.421636615811373,
      "grad_norm": 0.18976379930973053,
      "learning_rate": 0.00017683526244058716,
      "loss": 0.0528,
      "step": 220
    },
    {
      "epoch": 2.5325936199722605,
      "grad_norm": 0.30705705285072327,
      "learning_rate": 0.00017448280272246212,
      "loss": 0.0521,
      "step": 230
    },
    {
      "epoch": 2.6435506241331486,
      "grad_norm": 0.21610881388187408,
      "learning_rate": 0.000172033897852734,
      "loss": 0.0535,
      "step": 240
    },
    {
      "epoch": 2.754507628294036,
      "grad_norm": 0.18693220615386963,
      "learning_rate": 0.00016949171883185918,
      "loss": 0.0517,
      "step": 250
    },
    {
      "epoch": 2.8654646324549238,
      "grad_norm": 0.3321268558502197,
      "learning_rate": 0.0001668595574376992,
      "loss": 0.0407,
      "step": 260
    },
    {
      "epoch": 2.9764216366158114,
      "grad_norm": 0.20721495151519775,
      "learning_rate": 0.000164140821963114,
      "loss": 0.0417,
      "step": 270
    },
    {
      "epoch": 3.0776699029126213,
      "grad_norm": 0.20151656866073608,
      "learning_rate": 0.00016133903280268362,
      "loss": 0.0373,
      "step": 280
    },
    {
      "epoch": 3.188626907073509,
      "grad_norm": 0.3590203821659088,
      "learning_rate": 0.00015845781789427377,
      "loss": 0.0358,
      "step": 290
    },
    {
      "epoch": 3.2995839112343965,
      "grad_norm": 0.20630675554275513,
      "learning_rate": 0.000155500908021347,
      "loss": 0.0299,
      "step": 300
    },
    {
      "epoch": 3.410540915395284,
      "grad_norm": 0.3287246525287628,
      "learning_rate": 0.000152472131982103,
      "loss": 0.0331,
      "step": 310
    },
    {
      "epoch": 3.5214979195561718,
      "grad_norm": 0.24394913017749786,
      "learning_rate": 0.0001493754116317029,
      "loss": 0.0368,
      "step": 320
    },
    {
      "epoch": 3.63245492371706,
      "grad_norm": 0.20165830850601196,
      "learning_rate": 0.0001462147568039977,
      "loss": 0.0336,
      "step": 330
    },
    {
      "epoch": 3.7434119278779474,
      "grad_norm": 0.2538021504878998,
      "learning_rate": 0.00014299426011933568,
      "loss": 0.0295,
      "step": 340
    },
    {
      "epoch": 3.854368932038835,
      "grad_norm": 0.36229604482650757,
      "learning_rate": 0.00013971809168517298,
      "loss": 0.0358,
      "step": 350
    },
    {
      "epoch": 3.9653259361997226,
      "grad_norm": 0.4092184603214264,
      "learning_rate": 0.00013639049369634876,
      "loss": 0.034,
      "step": 360
    },
    {
      "epoch": 4.066574202496533,
      "grad_norm": 0.11960680782794952,
      "learning_rate": 0.00013301577494201664,
      "loss": 0.0233,
      "step": 370
    },
    {
      "epoch": 4.17753120665742,
      "grad_norm": 0.26415354013442993,
      "learning_rate": 0.00012959830522634596,
      "loss": 0.02,
      "step": 380
    },
    {
      "epoch": 4.288488210818308,
      "grad_norm": 0.21966516971588135,
      "learning_rate": 0.00012614250971021657,
      "loss": 0.0225,
      "step": 390
    },
    {
      "epoch": 4.399445214979195,
      "grad_norm": 0.2905697524547577,
      "learning_rate": 0.00012265286318123415,
      "loss": 0.0244,
      "step": 400
    },
    {
      "epoch": 4.510402219140083,
      "grad_norm": 0.24163606762886047,
      "learning_rate": 0.00011913388425948584,
      "loss": 0.017,
      "step": 410
    },
    {
      "epoch": 4.621359223300971,
      "grad_norm": 0.40009695291519165,
      "learning_rate": 0.00011559012954653865,
      "loss": 0.0219,
      "step": 420
    },
    {
      "epoch": 4.732316227461858,
      "grad_norm": 0.1963382512331009,
      "learning_rate": 0.0001120261877252568,
      "loss": 0.0179,
      "step": 430
    },
    {
      "epoch": 4.843273231622746,
      "grad_norm": 0.33989155292510986,
      "learning_rate": 0.00010844667361807842,
      "loss": 0.0198,
      "step": 440
    },
    {
      "epoch": 4.954230235783633,
      "grad_norm": 0.38484710454940796,
      "learning_rate": 0.00010485622221144484,
      "loss": 0.0249,
      "step": 450
    },
    {
      "epoch": 5.055478502080444,
      "grad_norm": 0.18945415318012238,
      "learning_rate": 0.00010125948265412033,
      "loss": 0.0177,
      "step": 460
    },
    {
      "epoch": 5.166435506241331,
      "grad_norm": 0.25906893610954285,
      "learning_rate": 9.766111223717352e-05,
      "loss": 0.0127,
      "step": 470
    },
    {
      "epoch": 5.277392510402219,
      "grad_norm": 0.23804187774658203,
      "learning_rate": 9.406577036341548e-05,
      "loss": 0.0128,
      "step": 480
    },
    {
      "epoch": 5.388349514563107,
      "grad_norm": 0.20456787943840027,
      "learning_rate": 9.047811251410376e-05,
      "loss": 0.0111,
      "step": 490
    },
    {
      "epoch": 5.499306518723994,
      "grad_norm": 0.15757159888744354,
      "learning_rate": 8.690278422072384e-05,
      "loss": 0.0101,
      "step": 500
    },
    {
      "epoch": 5.610263522884882,
      "grad_norm": 0.16691505908966064,
      "learning_rate": 8.334441504965455e-05,
      "loss": 0.0115,
      "step": 510
    },
    {
      "epoch": 5.721220527045769,
      "grad_norm": 0.5055399537086487,
      "learning_rate": 7.980761260750607e-05,
      "loss": 0.0088,
      "step": 520
    },
    {
      "epoch": 5.832177531206657,
      "grad_norm": 0.15076065063476562,
      "learning_rate": 7.629695657489257e-05,
      "loss": 0.0117,
      "step": 530
    },
    {
      "epoch": 5.943134535367545,
      "grad_norm": 0.09655993431806564,
      "learning_rate": 7.281699277636572e-05,
      "loss": 0.0111,
      "step": 540
    },
    {
      "epoch": 6.044382801664355,
      "grad_norm": 0.4866645336151123,
      "learning_rate": 6.93722272941869e-05,
      "loss": 0.0092,
      "step": 550
    },
    {
      "epoch": 6.155339805825243,
      "grad_norm": 0.1816895604133606,
      "learning_rate": 6.59671206335602e-05,
      "loss": 0.0082,
      "step": 560
    },
    {
      "epoch": 6.26629680998613,
      "grad_norm": 0.22271257638931274,
      "learning_rate": 6.260608194688206e-05,
      "loss": 0.0046,
      "step": 570
    },
    {
      "epoch": 6.377253814147018,
      "grad_norm": 0.06787201762199402,
      "learning_rate": 5.929346332448511e-05,
      "loss": 0.0051,
      "step": 580
    },
    {
      "epoch": 6.4882108183079055,
      "grad_norm": 0.09298055619001389,
      "learning_rate": 5.6033554159270294e-05,
      "loss": 0.0054,
      "step": 590
    },
    {
      "epoch": 6.599167822468793,
      "grad_norm": 0.03731105476617813,
      "learning_rate": 5.283057559252341e-05,
      "loss": 0.0053,
      "step": 600
    },
    {
      "epoch": 6.710124826629681,
      "grad_norm": 0.10652171820402145,
      "learning_rate": 4.96886750481082e-05,
      "loss": 0.0057,
      "step": 610
    },
    {
      "epoch": 6.821081830790568,
      "grad_norm": 0.2607424259185791,
      "learning_rate": 4.661192086211366e-05,
      "loss": 0.0077,
      "step": 620
    },
    {
      "epoch": 6.932038834951456,
      "grad_norm": 0.11328639835119247,
      "learning_rate": 4.360429701490934e-05,
      "loss": 0.0073,
      "step": 630
    },
    {
      "epoch": 7.033287101248266,
      "grad_norm": 0.0941685363650322,
      "learning_rate": 4.06696979724298e-05,
      "loss": 0.0039,
      "step": 640
    },
    {
      "epoch": 7.144244105409154,
      "grad_norm": 0.45776239037513733,
      "learning_rate": 3.7811923643367974e-05,
      "loss": 0.0032,
      "step": 650
    },
    {
      "epoch": 7.2552011095700415,
      "grad_norm": 0.08863729238510132,
      "learning_rate": 3.503467445880789e-05,
      "loss": 0.0026,
      "step": 660
    },
    {
      "epoch": 7.366158113730929,
      "grad_norm": 0.04661976918578148,
      "learning_rate": 3.2341546580666796e-05,
      "loss": 0.0024,
      "step": 670
    },
    {
      "epoch": 7.477115117891817,
      "grad_norm": 0.08003357797861099,
      "learning_rate": 2.9736027245152275e-05,
      "loss": 0.0022,
      "step": 680
    },
    {
      "epoch": 7.588072122052704,
      "grad_norm": 0.15967042744159698,
      "learning_rate": 2.722149024726307e-05,
      "loss": 0.0024,
      "step": 690
    },
    {
      "epoch": 7.699029126213592,
      "grad_norm": 0.0572751984000206,
      "learning_rate": 2.480119157218108e-05,
      "loss": 0.003,
      "step": 700
    },
    {
      "epoch": 7.8099861303744795,
      "grad_norm": 0.0780700072646141,
      "learning_rate": 2.247826517921121e-05,
      "loss": 0.0035,
      "step": 710
    },
    {
      "epoch": 7.920943134535367,
      "grad_norm": 0.19474399089813232,
      "learning_rate": 2.025571894372794e-05,
      "loss": 0.0027,
      "step": 720
    },
    {
      "epoch": 8.022191400832178,
      "grad_norm": 0.12848657369613647,
      "learning_rate": 1.813643076238375e-05,
      "loss": 0.002,
      "step": 730
    },
    {
      "epoch": 8.133148404993065,
      "grad_norm": 0.05772533640265465,
      "learning_rate": 1.6123144826622504e-05,
      "loss": 0.0017,
      "step": 740
    },
    {
      "epoch": 8.244105409153953,
      "grad_norm": 0.14121367037296295,
      "learning_rate": 1.4218468069322578e-05,
      "loss": 0.0013,
      "step": 750
    },
    {
      "epoch": 8.35506241331484,
      "grad_norm": 0.14342299103736877,
      "learning_rate": 1.2424866789171729e-05,
      "loss": 0.0016,
      "step": 760
    },
    {
      "epoch": 8.466019417475728,
      "grad_norm": 0.03438349440693855,
      "learning_rate": 1.0744663457143878e-05,
      "loss": 0.0011,
      "step": 770
    },
    {
      "epoch": 8.576976421636616,
      "grad_norm": 0.0756613090634346,
      "learning_rate": 9.180033709213454e-06,
      "loss": 0.0017,
      "step": 780
    },
    {
      "epoch": 8.687933425797503,
      "grad_norm": 0.0464102178812027,
      "learning_rate": 7.733003529201278e-06,
      "loss": 0.0014,
      "step": 790
    },
    {
      "epoch": 8.79889042995839,
      "grad_norm": 0.12452979385852814,
      "learning_rate": 6.405446625399481e-06,
      "loss": 0.0015,
      "step": 800
    },
    {
      "epoch": 8.909847434119278,
      "grad_norm": 0.08071909099817276,
      "learning_rate": 5.199082004372957e-06,
      "loss": 0.0014,
      "step": 810
    },
    {
      "epoch": 9.011095700416089,
      "grad_norm": 0.06948132812976837,
      "learning_rate": 4.115471745078314e-06,
      "loss": 0.0012,
      "step": 820
    },
    {
      "epoch": 9.122052704576976,
      "grad_norm": 0.07605510950088501,
      "learning_rate": 3.1560189761830728e-06,
      "loss": 0.0009,
      "step": 830
    },
    {
      "epoch": 9.233009708737864,
      "grad_norm": 0.0312280785292387,
      "learning_rate": 2.3219660592038285e-06,
      "loss": 0.0012,
      "step": 840
    },
    {
      "epoch": 9.343966712898752,
      "grad_norm": 0.02329327166080475,
      "learning_rate": 1.6143929798162704e-06,
      "loss": 0.001,
      "step": 850
    },
    {
      "epoch": 9.45492371705964,
      "grad_norm": 0.08054498583078384,
      "learning_rate": 1.034215949419748e-06,
      "loss": 0.0012,
      "step": 860
    },
    {
      "epoch": 9.565880721220527,
      "grad_norm": 0.09850303828716278,
      "learning_rate": 5.821862187675775e-07,
      "loss": 0.0011,
      "step": 870
    },
    {
      "epoch": 9.676837725381414,
      "grad_norm": 0.08373916149139404,
      "learning_rate": 2.588891051988895e-07,
      "loss": 0.0019,
      "step": 880
    },
    {
      "epoch": 9.787794729542302,
      "grad_norm": 0.017217393964529037,
      "learning_rate": 6.474323473194543e-08,
      "loss": 0.0009,
      "step": 890
    },
    {
      "epoch": 9.89875173370319,
      "grad_norm": 0.04848321154713631,
      "learning_rate": 0.0,
      "loss": 0.0009,
      "step": 900
    }
  ],
  "logging_steps": 10,
  "max_steps": 900,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 10,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 9.301284175906406e+16,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}