{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 682,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.014670823399963322,
      "grad_norm": 4.686307907104492,
      "learning_rate": 2.608695652173913e-05,
      "loss": 1.086164951324463,
      "step": 10
    },
    {
      "epoch": 0.029341646799926645,
      "grad_norm": 1.3878921270370483,
      "learning_rate": 5.507246376811594e-05,
      "loss": 0.13360737562179564,
      "step": 20
    },
    {
      "epoch": 0.04401247019988997,
      "grad_norm": 0.7091318368911743,
      "learning_rate": 8.405797101449276e-05,
      "loss": 0.09536288380622863,
      "step": 30
    },
    {
      "epoch": 0.05868329359985329,
      "grad_norm": 0.9854250550270081,
      "learning_rate": 0.00011304347826086956,
      "loss": 0.06767122745513916,
      "step": 40
    },
    {
      "epoch": 0.07335411699981662,
      "grad_norm": 0.6373249888420105,
      "learning_rate": 0.00014202898550724638,
      "loss": 0.07146756052970886,
      "step": 50
    },
    {
      "epoch": 0.08802494039977994,
      "grad_norm": 0.6071982979774475,
      "learning_rate": 0.0001710144927536232,
      "loss": 0.07726313471794129,
      "step": 60
    },
    {
      "epoch": 0.10269576379974327,
      "grad_norm": 0.9975550174713135,
      "learning_rate": 0.0002,
      "loss": 0.06706693768501282,
      "step": 70
    },
    {
      "epoch": 0.11736658719970658,
      "grad_norm": 0.5094273090362549,
      "learning_rate": 0.00019986870332074194,
      "loss": 0.07236328125,
      "step": 80
    },
    {
      "epoch": 0.13203741059966992,
      "grad_norm": 0.7871644496917725,
      "learning_rate": 0.00019947515805932744,
      "loss": 0.07177542448043824,
      "step": 90
    },
    {
      "epoch": 0.14670823399963323,
      "grad_norm": 1.741397738456726,
      "learning_rate": 0.0001988203976394757,
      "loss": 0.048826560378074646,
      "step": 100
    },
    {
      "epoch": 0.16137905739959654,
      "grad_norm": 0.5896034836769104,
      "learning_rate": 0.0001979061414185635,
      "loss": 0.06617986559867858,
      "step": 110
    },
    {
      "epoch": 0.17604988079955988,
      "grad_norm": 0.7956095337867737,
      "learning_rate": 0.0001967347901727067,
      "loss": 0.08305451273918152,
      "step": 120
    },
    {
      "epoch": 0.1907207041995232,
      "grad_norm": 0.492136150598526,
      "learning_rate": 0.0001953094197924819,
      "loss": 0.06995530128479004,
      "step": 130
    },
    {
      "epoch": 0.20539152759948653,
      "grad_norm": 0.6558647751808167,
      "learning_rate": 0.00019363377320584174,
      "loss": 0.05795600414276123,
      "step": 140
    },
    {
      "epoch": 0.22006235099944985,
      "grad_norm": 0.5106936097145081,
      "learning_rate": 0.000191712250549435,
      "loss": 0.055801987648010254,
      "step": 150
    },
    {
      "epoch": 0.23473317439941316,
      "grad_norm": 0.44069793820381165,
      "learning_rate": 0.0001895498976141398,
      "loss": 0.06355689167976379,
      "step": 160
    },
    {
      "epoch": 0.2494039977993765,
      "grad_norm": 0.405668169260025,
      "learning_rate": 0.00018715239259515184,
      "loss": 0.05164743065834045,
      "step": 170
    },
    {
      "epoch": 0.26407482119933984,
      "grad_norm": 0.3792116045951843,
      "learning_rate": 0.00018452603118142112,
      "loss": 0.0679062008857727,
      "step": 180
    },
    {
      "epoch": 0.2787456445993031,
      "grad_norm": 0.6442562937736511,
      "learning_rate": 0.00018167771002359072,
      "loss": 0.06293455362319947,
      "step": 190
    },
    {
      "epoch": 0.29341646799926646,
      "grad_norm": 0.5488963723182678,
      "learning_rate": 0.0001786149086238503,
      "loss": 0.0556623637676239,
      "step": 200
    },
    {
      "epoch": 0.3080872913992298,
      "grad_norm": 0.29444584250450134,
      "learning_rate": 0.0001753456696952601,
      "loss": 0.0685071349143982,
      "step": 210
    },
    {
      "epoch": 0.3227581147991931,
      "grad_norm": 0.414754718542099,
      "learning_rate": 0.0001718785780421207,
      "loss": 0.06181240677833557,
      "step": 220
    },
    {
      "epoch": 0.3374289381991564,
      "grad_norm": 0.32311928272247314,
      "learning_rate": 0.00016822273801684682,
      "loss": 0.07574231624603271,
      "step": 230
    },
    {
      "epoch": 0.35209976159911976,
      "grad_norm": 0.48792004585266113,
      "learning_rate": 0.00016438774961254285,
      "loss": 0.043923291563987735,
      "step": 240
    },
    {
      "epoch": 0.36677058499908305,
      "grad_norm": 1.073688268661499,
      "learning_rate": 0.00016038368325405834,
      "loss": 0.05729702115058899,
      "step": 250
    },
    {
      "epoch": 0.3814414083990464,
      "grad_norm": 0.5740509629249573,
      "learning_rate": 0.00015622105335372127,
      "loss": 0.06541360020637513,
      "step": 260
    },
    {
      "epoch": 0.39611223179900973,
      "grad_norm": 0.6383430361747742,
      "learning_rate": 0.0001519107907011895,
      "loss": 0.057945191860198975,
      "step": 270
    },
    {
      "epoch": 0.41078305519897307,
      "grad_norm": 0.46824830770492554,
      "learning_rate": 0.0001474642137599232,
      "loss": 0.05833690166473389,
      "step": 280
    },
    {
      "epoch": 0.42545387859893635,
      "grad_norm": 0.689471960067749,
      "learning_rate": 0.00014289299894565147,
      "loss": 0.055763131380081175,
      "step": 290
    },
    {
      "epoch": 0.4401247019988997,
      "grad_norm": 0.362657368183136,
      "learning_rate": 0.00013820914996488004,
      "loss": 0.07186369895935059,
      "step": 300
    },
    {
      "epoch": 0.45479552539886303,
      "grad_norm": 0.40035027265548706,
      "learning_rate": 0.00013342496629395538,
      "loss": 0.06522900462150574,
      "step": 310
    },
    {
      "epoch": 0.4694663487988263,
      "grad_norm": 0.4143030345439911,
      "learning_rate": 0.00012855301088145652,
      "loss": 0.040158060193061826,
      "step": 320
    },
    {
      "epoch": 0.48413717219878966,
      "grad_norm": 0.24200226366519928,
      "learning_rate": 0.0001236060771587266,
      "loss": 0.05714940428733826,
      "step": 330
    },
    {
      "epoch": 0.498807995598753,
      "grad_norm": 0.2773888111114502,
      "learning_rate": 0.00011859715544517164,
      "loss": 0.04442446827888489,
      "step": 340
    },
    {
      "epoch": 0.5134788189987163,
      "grad_norm": 0.3424382209777832,
      "learning_rate": 0.00011353939883654476,
      "loss": 0.0499860942363739,
      "step": 350
    },
    {
      "epoch": 0.5281496423986797,
      "grad_norm": 0.4579457938671112,
      "learning_rate": 0.0001084460886657901,
      "loss": 0.05433698296546936,
      "step": 360
    },
    {
      "epoch": 0.542820465798643,
      "grad_norm": 0.6211843490600586,
      "learning_rate": 0.00010333059962714469,
      "loss": 0.048888799548149106,
      "step": 370
    },
    {
      "epoch": 0.5574912891986062,
      "grad_norm": 0.5006217360496521,
      "learning_rate": 9.820636465507961e-05,
      "loss": 0.05106990933418274,
      "step": 380
    },
    {
      "epoch": 0.5721621125985696,
      "grad_norm": 0.3458799719810486,
      "learning_rate": 9.308683965030631e-05,
      "loss": 0.03896746933460236,
      "step": 390
    },
    {
      "epoch": 0.5868329359985329,
      "grad_norm": 0.29490038752555847,
      "learning_rate": 8.798546814547487e-05,
      "loss": 0.044534245133399965,
      "step": 400
    },
    {
      "epoch": 0.6015037593984962,
      "grad_norm": 0.28574299812316895,
      "learning_rate": 8.291564600335022e-05,
      "loss": 0.04822182059288025,
      "step": 410
    },
    {
      "epoch": 0.6161745827984596,
      "grad_norm": 0.3831021785736084,
      "learning_rate": 7.789068624016616e-05,
      "loss": 0.04143353998661041,
      "step": 420
    },
    {
      "epoch": 0.6308454061984229,
      "grad_norm": 0.3398614525794983,
      "learning_rate": 7.292378406652891e-05,
      "loss": 0.039598295092582704,
      "step": 430
    },
    {
      "epoch": 0.6455162295983862,
      "grad_norm": 0.22855930030345917,
      "learning_rate": 6.802798223767044e-05,
      "loss": 0.04850543141365051,
      "step": 440
    },
    {
      "epoch": 0.6601870529983496,
      "grad_norm": 0.26312509179115295,
      "learning_rate": 6.321613680403946e-05,
      "loss": 0.05014724731445312,
      "step": 450
    },
    {
      "epoch": 0.6748578763983129,
      "grad_norm": 0.343281090259552,
      "learning_rate": 5.8500883352166715e-05,
      "loss": 0.03788905143737793,
      "step": 460
    },
    {
      "epoch": 0.6895286997982761,
      "grad_norm": 0.6305585503578186,
      "learning_rate": 5.3894603824454056e-05,
      "loss": 0.05027334094047546,
      "step": 470
    },
    {
      "epoch": 0.7041995231982395,
      "grad_norm": 0.09157969057559967,
      "learning_rate": 4.940939400501593e-05,
      "loss": 0.04882456958293915,
      "step": 480
    },
    {
      "epoch": 0.7188703465982028,
      "grad_norm": 0.2803705632686615,
      "learning_rate": 4.505703175695366e-05,
      "loss": 0.051465296745300294,
      "step": 490
    },
    {
      "epoch": 0.7335411699981661,
      "grad_norm": 0.32852405309677124,
      "learning_rate": 4.0848946094469334e-05,
      "loss": 0.04661123156547546,
      "step": 500
    },
    {
      "epoch": 0.7482119933981295,
      "grad_norm": 0.31889474391937256,
      "learning_rate": 3.679618717103316e-05,
      "loss": 0.04635309278964996,
      "step": 510
    },
    {
      "epoch": 0.7628828167980928,
      "grad_norm": 0.2887394428253174,
      "learning_rate": 3.2909397262414845e-05,
      "loss": 0.04521143436431885,
      "step": 520
    },
    {
      "epoch": 0.7775536401980561,
      "grad_norm": 0.4780530035495758,
      "learning_rate": 2.9198782820773828e-05,
      "loss": 0.041201579570770266,
      "step": 530
    },
    {
      "epoch": 0.7922244635980195,
      "grad_norm": 0.4354000985622406,
      "learning_rate": 2.5674087673194115e-05,
      "loss": 0.036979615688323975,
      "step": 540
    },
    {
      "epoch": 0.8068952869979827,
      "grad_norm": 0.11631964892148972,
      "learning_rate": 2.2344567435041054e-05,
      "loss": 0.03683710396289826,
      "step": 550
    },
    {
      "epoch": 0.8215661103979461,
      "grad_norm": 0.40628868341445923,
      "learning_rate": 1.9218965205330576e-05,
      "loss": 0.04675011336803436,
      "step": 560
    },
    {
      "epoch": 0.8362369337979094,
      "grad_norm": 0.31028568744659424,
      "learning_rate": 1.6305488607931486e-05,
      "loss": 0.033157148957252504,
      "step": 570
    },
    {
      "epoch": 0.8509077571978727,
      "grad_norm": 0.26061493158340454,
      "learning_rate": 1.3611788238890511e-05,
      "loss": 0.04655841886997223,
      "step": 580
    },
    {
      "epoch": 0.8655785805978361,
      "grad_norm": 0.2419964224100113,
      "learning_rate": 1.114493757647508e-05,
      "loss": 0.030328187346458434,
      "step": 590
    },
    {
      "epoch": 0.8802494039977994,
      "grad_norm": 0.3614746332168579,
      "learning_rate": 8.911414406689145e-06,
      "loss": 0.03218616545200348,
      "step": 600
    },
    {
      "epoch": 0.8949202273977627,
      "grad_norm": 0.37310630083084106,
      "learning_rate": 6.9170838130375505e-06,
      "loss": 0.04881116449832916,
      "step": 610
    },
    {
      "epoch": 0.9095910507977261,
      "grad_norm": 0.2813904881477356,
      "learning_rate": 5.167182775206026e-06,
      "loss": 0.05659586191177368,
      "step": 620
    },
    {
      "epoch": 0.9242618741976893,
      "grad_norm": 0.13661852478981018,
      "learning_rate": 3.6663064171005956e-06,
      "loss": 0.042176204919815066,
      "step": 630
    },
    {
      "epoch": 0.9389326975976526,
      "grad_norm": 0.3304874300956726,
      "learning_rate": 2.418395940357099e-06,
      "loss": 0.05347890257835388,
      "step": 640
    },
    {
      "epoch": 0.953603520997616,
      "grad_norm": 0.34348776936531067,
      "learning_rate": 1.4267282750077493e-06,
      "loss": 0.04495801329612732,
      "step": 650
    },
    {
      "epoch": 0.9682743443975793,
      "grad_norm": 0.27644288539886475,
      "learning_rate": 6.93907474480282e-07,
      "loss": 0.04055593609809875,
      "step": 660
    },
    {
      "epoch": 0.9829451677975426,
      "grad_norm": 0.2929557263851166,
      "learning_rate": 2.2185787752672104e-07,
      "loss": 0.03816842138767242,
      "step": 670
    },
    {
      "epoch": 0.997615991197506,
      "grad_norm": 0.2137778252363205,
      "learning_rate": 1.1819055037554095e-08,
      "loss": 0.05327551364898682,
      "step": 680
    },
    {
      "epoch": 1.0,
      "step": 682,
      "total_flos": 3.3343785449628713e+18,
      "train_loss": 0.07003523399807578,
      "train_runtime": 66098.6339,
      "train_samples_per_second": 0.165,
      "train_steps_per_second": 0.01
    }
  ],
  "logging_steps": 10,
  "max_steps": 682,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 3.3343785449628713e+18,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}