{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 4029,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.02482005460412013,
      "grad_norm": 0.1369238644838333,
      "learning_rate": 6.666666666666667e-06,
      "loss": 2.4042,
      "step": 100
    },
    {
      "epoch": 0.04964010920824026,
      "grad_norm": 0.17341101169586182,
      "learning_rate": 1.3333333333333333e-05,
      "loss": 2.3581,
      "step": 200
    },
    {
      "epoch": 0.07446016381236038,
      "grad_norm": 0.30298689007759094,
      "learning_rate": 2e-05,
      "loss": 2.2979,
      "step": 300
    },
    {
      "epoch": 0.09928021841648052,
      "grad_norm": 0.4181392788887024,
      "learning_rate": 1.9964532702725803e-05,
      "loss": 2.2736,
      "step": 400
    },
    {
      "epoch": 0.12410027302060064,
      "grad_norm": 0.4833754301071167,
      "learning_rate": 1.9858382396738395e-05,
      "loss": 2.2352,
      "step": 500
    },
    {
      "epoch": 0.14892032762472077,
      "grad_norm": 0.5508949756622314,
      "learning_rate": 1.9682302054929414e-05,
      "loss": 2.1951,
      "step": 600
    },
    {
      "epoch": 0.17374038222884092,
      "grad_norm": 0.5856565833091736,
      "learning_rate": 1.943754069606428e-05,
      "loss": 2.1662,
      "step": 700
    },
    {
      "epoch": 0.19856043683296104,
      "grad_norm": 0.5611233115196228,
      "learning_rate": 1.9125834524918215e-05,
      "loss": 2.1815,
      "step": 800
    },
    {
      "epoch": 0.22338049143708116,
      "grad_norm": 0.6802138090133667,
      "learning_rate": 1.8749394616578068e-05,
      "loss": 2.1675,
      "step": 900
    },
    {
      "epoch": 0.2482005460412013,
      "grad_norm": 0.6513592004776001,
      "learning_rate": 1.8310891232270827e-05,
      "loss": 2.1402,
      "step": 1000
    },
    {
      "epoch": 0.2730206006453214,
      "grad_norm": 0.6889598369598389,
      "learning_rate": 1.781343487797389e-05,
      "loss": 2.1334,
      "step": 1100
    },
    {
      "epoch": 0.29784065524944153,
      "grad_norm": 0.7928256988525391,
      "learning_rate": 1.7260554240167017e-05,
      "loss": 2.1295,
      "step": 1200
    },
    {
      "epoch": 0.32266070985356166,
      "grad_norm": 0.7162489295005798,
      "learning_rate": 1.665617115523785e-05,
      "loss": 2.1232,
      "step": 1300
    },
    {
      "epoch": 0.34748076445768183,
      "grad_norm": 0.7136086225509644,
      "learning_rate": 1.6004572790094535e-05,
      "loss": 2.1148,
      "step": 1400
    },
    {
      "epoch": 0.37230081906180196,
      "grad_norm": 0.7688263654708862,
      "learning_rate": 1.531038123132105e-05,
      "loss": 2.0873,
      "step": 1500
    },
    {
      "epoch": 0.3971208736659221,
      "grad_norm": 0.772521436214447,
      "learning_rate": 1.4578520698593441e-05,
      "loss": 2.117,
      "step": 1600
    },
    {
      "epoch": 0.4219409282700422,
      "grad_norm": 1.010330080986023,
      "learning_rate": 1.3814182614927217e-05,
      "loss": 2.071,
      "step": 1700
    },
    {
      "epoch": 0.4467609828741623,
      "grad_norm": 0.6752054691314697,
      "learning_rate": 1.3022788781528653e-05,
      "loss": 2.0636,
      "step": 1800
    },
    {
      "epoch": 0.47158103747828245,
      "grad_norm": 0.841232180595398,
      "learning_rate": 1.220995291846777e-05,
      "loss": 2.0532,
      "step": 1900
    },
    {
      "epoch": 0.4964010920824026,
      "grad_norm": 0.7984778881072998,
      "learning_rate": 1.1381440843982634e-05,
      "loss": 2.0438,
      "step": 2000
    },
    {
      "epoch": 0.5212211466865228,
      "grad_norm": 0.8068585395812988,
      "learning_rate": 1.0543129574881446e-05,
      "loss": 2.0687,
      "step": 2100
    },
    {
      "epoch": 0.5460412012906428,
      "grad_norm": 0.8497598767280579,
      "learning_rate": 9.700965638162112e-06,
      "loss": 2.0477,
      "step": 2200
    },
    {
      "epoch": 0.570861255894763,
      "grad_norm": 0.7474705576896667,
      "learning_rate": 8.860922889564078e-06,
      "loss": 2.0429,
      "step": 2300
    },
    {
      "epoch": 0.5956813104988831,
      "grad_norm": 1.0781651735305786,
      "learning_rate": 8.028960138264857e-06,
      "loss": 2.0389,
      "step": 2400
    },
    {
      "epoch": 0.6205013651030032,
      "grad_norm": 0.8750322461128235,
      "learning_rate": 7.21097887830873e-06,
      "loss": 2.046,
      "step": 2500
    },
    {
      "epoch": 0.6453214197071233,
      "grad_norm": 0.9259145855903625,
      "learning_rate": 6.4127814265980095e-06,
      "loss": 2.0243,
      "step": 2600
    },
    {
      "epoch": 0.6701414743112435,
      "grad_norm": 1.1625196933746338,
      "learning_rate": 5.640029764393366e-06,
      "loss": 2.0513,
      "step": 2700
    },
    {
      "epoch": 0.6949615289153637,
      "grad_norm": 0.8271129727363586,
      "learning_rate": 4.8982053742793025e-06,
      "loss": 2.0228,
      "step": 2800
    },
    {
      "epoch": 0.7197815835194837,
      "grad_norm": 0.7196031212806702,
      "learning_rate": 4.1925703574897115e-06,
      "loss": 2.0496,
      "step": 2900
    },
    {
      "epoch": 0.7446016381236039,
      "grad_norm": 0.7880265712738037,
      "learning_rate": 3.528130107406099e-06,
      "loss": 2.0145,
      "step": 3000
    },
    {
      "epoch": 0.769421692727724,
      "grad_norm": 0.909106433391571,
      "learning_rate": 2.909597804002603e-06,
      "loss": 2.0437,
      "step": 3100
    },
    {
      "epoch": 0.7942417473318442,
      "grad_norm": 1.2606161832809448,
      "learning_rate": 2.341360981094921e-06,
      "loss": 2.0443,
      "step": 3200
    },
    {
      "epoch": 0.8190618019359642,
      "grad_norm": 0.795652449131012,
      "learning_rate": 1.8274504035470942e-06,
      "loss": 2.0568,
      "step": 3300
    },
    {
      "epoch": 0.8438818565400844,
      "grad_norm": 0.8904260993003845,
      "learning_rate": 1.3715114752043746e-06,
      "loss": 2.0787,
      "step": 3400
    },
    {
      "epoch": 0.8687019111442045,
      "grad_norm": 1.0925287008285522,
      "learning_rate": 9.767783803688414e-07,
      "loss": 2.045,
      "step": 3500
    },
    {
      "epoch": 0.8935219657483247,
      "grad_norm": 0.799608588218689,
      "learning_rate": 6.460511422441984e-07,
      "loss": 2.0167,
      "step": 3600
    },
    {
      "epoch": 0.9183420203524447,
      "grad_norm": 0.9094216227531433,
      "learning_rate": 3.8167576108468994e-07,
      "loss": 2.057,
      "step": 3700
    },
    {
      "epoch": 0.9431620749565649,
      "grad_norm": 0.8395094871520996,
      "learning_rate": 1.855275729374284e-07,
      "loss": 2.0425,
      "step": 3800
    },
    {
      "epoch": 0.9679821295606851,
      "grad_norm": 0.8606423735618591,
      "learning_rate": 5.89979470221802e-08,
      "loss": 2.0208,
      "step": 3900
    },
    {
      "epoch": 0.9928021841648051,
      "grad_norm": 0.8908767700195312,
      "learning_rate": 2.9844161102077218e-09,
      "loss": 2.0512,
      "step": 4000
    },
    {
      "epoch": 1.0,
      "step": 4029,
      "total_flos": 7.32108351012864e+16,
      "train_loss": 2.105005581936963,
      "train_runtime": 1251.4031,
      "train_samples_per_second": 6.438,
      "train_steps_per_second": 3.22
    }
  ],
  "logging_steps": 100,
  "max_steps": 4029,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 7.32108351012864e+16,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}