{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 11.444444444444445,
  "eval_steps": 500,
  "global_step": 4200,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.136332651670075,
      "grad_norm": 11.269988059997559,
      "learning_rate": 0.00010652173913043477,
      "loss": 39.8198193359375,
      "step": 50
    },
    {
      "epoch": 0.27266530334015,
      "grad_norm": 14.962249755859375,
      "learning_rate": 0.0002152173913043478,
      "loss": 26.48206298828125,
      "step": 100
    },
    {
      "epoch": 0.40899795501022496,
      "grad_norm": 16.134239196777344,
      "learning_rate": 0.0003239130434782608,
      "loss": 19.36090087890625,
      "step": 150
    },
    {
      "epoch": 0.5453306066803,
      "grad_norm": 12.964629173278809,
      "learning_rate": 0.00043260869565217385,
      "loss": 14.5181689453125,
      "step": 200
    },
    {
      "epoch": 0.6816632583503749,
      "grad_norm": 10.061964988708496,
      "learning_rate": 0.0005413043478260869,
      "loss": 12.766522216796876,
      "step": 250
    },
    {
      "epoch": 0.8179959100204499,
      "grad_norm": 6.430169105529785,
      "learning_rate": 0.0005999713580566041,
      "loss": 11.881512451171876,
      "step": 300
    },
    {
      "epoch": 0.9543285616905249,
      "grad_norm": 5.836061954498291,
      "learning_rate": 0.0005997115106245061,
      "loss": 11.075633544921875,
      "step": 350
    },
    {
      "epoch": 1.0899795501022496,
      "grad_norm": 4.700469017028809,
      "learning_rate": 0.000599181221756225,
      "loss": 10.015964965820313,
      "step": 400
    },
    {
      "epoch": 1.2263122017723245,
      "grad_norm": 4.282314777374268,
      "learning_rate": 0.0005983809699521793,
      "loss": 9.592711181640626,
      "step": 450
    },
    {
      "epoch": 1.3626448534423994,
      "grad_norm": 3.662461280822754,
      "learning_rate": 0.0005973114773109183,
      "loss": 9.211348266601563,
      "step": 500
    },
    {
      "epoch": 1.4989775051124745,
      "grad_norm": 3.783862590789795,
      "learning_rate": 0.0005959737088775463,
      "loss": 8.782565307617187,
      "step": 550
    },
    {
      "epoch": 1.6353101567825494,
      "grad_norm": 3.429069757461548,
      "learning_rate": 0.0005943688717729229,
      "loss": 8.386593627929688,
      "step": 600
    },
    {
      "epoch": 1.7716428084526243,
      "grad_norm": 2.529482841491699,
      "learning_rate": 0.0005924984141044315,
      "loss": 8.06916259765625,
      "step": 650
    },
    {
      "epoch": 1.9079754601226995,
      "grad_norm": 2.4761486053466797,
      "learning_rate": 0.0005903640236592949,
      "loss": 7.7736474609375,
      "step": 700
    },
    {
      "epoch": 2.043626448534424,
      "grad_norm": 2.3675243854522705,
      "learning_rate": 0.0005879676263816192,
      "loss": 7.4699859619140625,
      "step": 750
    },
    {
      "epoch": 2.179959100204499,
      "grad_norm": 2.305575132369995,
      "learning_rate": 0.0005853113846345384,
      "loss": 7.305302734375,
      "step": 800
    },
    {
      "epoch": 2.316291751874574,
      "grad_norm": 2.3954548835754395,
      "learning_rate": 0.0005823976952490298,
      "loss": 7.102890625,
      "step": 850
    },
    {
      "epoch": 2.452624403544649,
      "grad_norm": 2.2483468055725098,
      "learning_rate": 0.0005792291873611596,
      "loss": 6.9435498046875,
      "step": 900
    },
    {
      "epoch": 2.588957055214724,
      "grad_norm": 2.0154542922973633,
      "learning_rate": 0.00057580872003971,
      "loss": 6.761861572265625,
      "step": 950
    },
    {
      "epoch": 2.7252897068847988,
      "grad_norm": 2.155539035797119,
      "learning_rate": 0.00057213937970633,
      "loss": 6.58298583984375,
      "step": 1000
    },
    {
      "epoch": 2.861622358554874,
      "grad_norm": 1.7670893669128418,
      "learning_rate": 0.0005682244773505363,
      "loss": 6.419741821289063,
      "step": 1050
    },
    {
      "epoch": 2.997955010224949,
      "grad_norm": 2.3611533641815186,
      "learning_rate": 0.0005640675455420765,
      "loss": 6.288121948242187,
      "step": 1100
    },
    {
      "epoch": 3.1336059986366736,
      "grad_norm": 1.8783221244812012,
      "learning_rate": 0.0005596723352433551,
      "loss": 6.090737915039062,
      "step": 1150
    },
    {
      "epoch": 3.2699386503067487,
      "grad_norm": 2.2795541286468506,
      "learning_rate": 0.0005550428124247912,
      "loss": 5.979439086914063,
      "step": 1200
    },
    {
      "epoch": 3.4062713019768234,
      "grad_norm": 1.3074142932891846,
      "learning_rate": 0.0005501831544861696,
      "loss": 5.897046508789063,
      "step": 1250
    },
    {
      "epoch": 3.5426039536468985,
      "grad_norm": 1.259432077407837,
      "learning_rate": 0.0005450977464872081,
      "loss": 5.734913940429688,
      "step": 1300
    },
    {
      "epoch": 3.6789366053169736,
      "grad_norm": 1.4511066675186157,
      "learning_rate": 0.0005397911771907473,
      "loss": 5.604786987304688,
      "step": 1350
    },
    {
      "epoch": 3.8152692569870483,
      "grad_norm": 1.428958535194397,
      "learning_rate": 0.0005342682349221297,
      "loss": 5.445667114257812,
      "step": 1400
    },
    {
      "epoch": 3.9516019086571235,
      "grad_norm": 1.2921603918075562,
      "learning_rate": 0.000528533903248506,
      "loss": 5.391282958984375,
      "step": 1450
    },
    {
      "epoch": 4.087252897068848,
      "grad_norm": 1.3616167306900024,
      "learning_rate": 0.0005225933564819676,
      "loss": 5.183615112304688,
      "step": 1500
    },
    {
      "epoch": 4.223585548738923,
      "grad_norm": 1.2340154647827148,
      "learning_rate": 0.0005164519550105623,
      "loss": 5.060681457519531,
      "step": 1550
    },
    {
      "epoch": 4.359918200408998,
      "grad_norm": 1.5526384115219116,
      "learning_rate": 0.0005101152404614052,
      "loss": 4.902400817871094,
      "step": 1600
    },
    {
      "epoch": 4.496250852079073,
      "grad_norm": 1.4958291053771973,
      "learning_rate": 0.0005035889307002529,
      "loss": 4.787099304199219,
      "step": 1650
    },
    {
      "epoch": 4.632583503749148,
      "grad_norm": 1.4236118793487549,
      "learning_rate": 0.0004968789146720478,
      "loss": 4.660638427734375,
      "step": 1700
    },
    {
      "epoch": 4.768916155419223,
      "grad_norm": 1.3592256307601929,
      "learning_rate": 0.0004899912470870939,
      "loss": 4.454691162109375,
      "step": 1750
    },
    {
      "epoch": 4.905248807089298,
      "grad_norm": 1.502578616142273,
      "learning_rate": 0.00048293214295765303,
      "loss": 4.297479553222656,
      "step": 1800
    },
    {
      "epoch": 5.040899795501023,
      "grad_norm": 1.7106261253356934,
      "learning_rate": 0.0004757079719898968,
      "loss": 4.13409423828125,
      "step": 1850
    },
    {
      "epoch": 5.1772324471710975,
      "grad_norm": 1.2559808492660522,
      "learning_rate": 0.00046832525283627114,
      "loss": 3.96047607421875,
      "step": 1900
    },
    {
      "epoch": 5.313565098841172,
      "grad_norm": 1.2694923877716064,
      "learning_rate": 0.0004607906472134603,
      "loss": 3.8196981811523436,
      "step": 1950
    },
    {
      "epoch": 5.449897750511248,
      "grad_norm": 1.6992137432098389,
      "learning_rate": 0.0004531109538912596,
      "loss": 3.6628662109375,
      "step": 2000
    },
    {
      "epoch": 5.5862304021813225,
      "grad_norm": 1.453190803527832,
      "learning_rate": 0.00044529310255777855,
      "loss": 3.52033935546875,
      "step": 2050
    },
    {
      "epoch": 5.722563053851397,
      "grad_norm": 1.504873514175415,
      "learning_rate": 0.0004373441475665124,
      "loss": 3.3988775634765624,
      "step": 2100
    },
    {
      "epoch": 5.858895705521473,
      "grad_norm": 1.4465556144714355,
      "learning_rate": 0.00042927126157092204,
      "loss": 3.2702841186523437,
      "step": 2150
    },
    {
      "epoch": 5.9952283571915475,
      "grad_norm": 1.4014344215393066,
      "learning_rate": 0.0004210817290522684,
      "loss": 3.1291094970703126,
      "step": 2200
    },
    {
      "epoch": 6.130879345603272,
      "grad_norm": 1.645821213722229,
      "learning_rate": 0.00041278293974653904,
      "loss": 2.936179504394531,
      "step": 2250
    },
    {
      "epoch": 6.267211997273347,
      "grad_norm": 1.6427346467971802,
      "learning_rate": 0.00040438238197640066,
      "loss": 2.857735900878906,
      "step": 2300
    },
    {
      "epoch": 6.403544648943422,
      "grad_norm": 1.66471529006958,
      "learning_rate": 0.00039588763589419156,
      "loss": 2.748570556640625,
      "step": 2350
    },
    {
      "epoch": 6.539877300613497,
      "grad_norm": 1.5286723375320435,
      "learning_rate": 0.0003873063666420535,
      "loss": 2.6635064697265625,
      "step": 2400
    },
    {
      "epoch": 6.676209952283572,
      "grad_norm": 1.5245234966278076,
      "learning_rate": 0.00037864631743537395,
      "loss": 2.556291046142578,
      "step": 2450
    },
    {
      "epoch": 6.812542603953647,
      "grad_norm": 1.4000400304794312,
      "learning_rate": 0.000369915302575779,
      "loss": 2.4817964172363283,
      "step": 2500
    },
    {
      "epoch": 6.948875255623722,
      "grad_norm": 1.3968268632888794,
      "learning_rate": 0.00036112120039998323,
      "loss": 2.362508087158203,
      "step": 2550
    },
    {
      "epoch": 7.084526244035446,
      "grad_norm": 1.6692546606063843,
      "learning_rate": 0.0003522719461708582,
      "loss": 2.273824005126953,
      "step": 2600
    },
    {
      "epoch": 7.220858895705521,
      "grad_norm": 1.4489309787750244,
      "learning_rate": 0.00034337552491713324,
      "loss": 2.1658897399902344,
      "step": 2650
    },
    {
      "epoch": 7.357191547375597,
      "grad_norm": 1.5687353610992432,
      "learning_rate": 0.00033443996422819145,
      "loss": 2.108182220458984,
      "step": 2700
    },
    {
      "epoch": 7.493524199045671,
      "grad_norm": 1.6350905895233154,
      "learning_rate": 0.00032547332701046195,
      "loss": 1.99987060546875,
      "step": 2750
    },
    {
      "epoch": 7.629856850715746,
      "grad_norm": 1.5019129514694214,
      "learning_rate": 0.0003164837042119428,
      "loss": 1.9454510498046875,
      "step": 2800
    },
    {
      "epoch": 7.766189502385822,
      "grad_norm": 1.4423465728759766,
      "learning_rate": 0.00030747920752142186,
      "loss": 1.9158531188964845,
      "step": 2850
    },
    {
      "epoch": 7.902522154055896,
      "grad_norm": 1.5868362188339233,
      "learning_rate": 0.0002984679620489827,
      "loss": 1.8568917846679687,
      "step": 2900
    },
    {
      "epoch": 8.03817314246762,
      "grad_norm": 1.7355551719665527,
      "learning_rate": 0.0002894580989943989,
      "loss": 1.7664053344726562,
      "step": 2950
    },
    {
      "epoch": 8.174505794137696,
      "grad_norm": 1.4344327449798584,
      "learning_rate": 0.0002804577483100344,
      "loss": 1.6748054504394532,
      "step": 3000
    },
    {
      "epoch": 8.310838445807772,
      "grad_norm": 1.6083476543426514,
      "learning_rate": 0.00027147503136486895,
      "loss": 1.6389869689941405,
      "step": 3050
    },
    {
      "epoch": 8.447171097477845,
      "grad_norm": 1.412381649017334,
      "learning_rate": 0.0002625180536162685,
      "loss": 1.6107588195800782,
      "step": 3100
    },
    {
      "epoch": 8.583503749147921,
      "grad_norm": 1.4404499530792236,
      "learning_rate": 0.00025359489729611366,
      "loss": 1.558354034423828,
      "step": 3150
    },
    {
      "epoch": 8.719836400817996,
      "grad_norm": 1.394539713859558,
      "learning_rate": 0.0002447136141178857,
      "loss": 1.5231396484375,
      "step": 3200
    },
    {
      "epoch": 8.85616905248807,
      "grad_norm": 1.4844084978103638,
      "learning_rate": 0.00023588221801128917,
      "loss": 1.4771731567382813,
      "step": 3250
    },
    {
      "epoch": 8.992501704158146,
      "grad_norm": 1.3957374095916748,
      "learning_rate": 0.0002271086778909701,
      "loss": 1.4401710510253907,
      "step": 3300
    },
    {
      "epoch": 9.12815269256987,
      "grad_norm": 1.4386154413223267,
      "learning_rate": 0.00021840091046585182,
      "loss": 1.3497396850585937,
      "step": 3350
    },
    {
      "epoch": 9.264485344239946,
      "grad_norm": 1.4959100484848022,
      "learning_rate": 0.000209766773095578,
      "loss": 1.3368931579589844,
      "step": 3400
    },
    {
      "epoch": 9.400817995910021,
      "grad_norm": 1.3249437808990479,
      "learning_rate": 0.00020121405670051008,
      "loss": 1.297091064453125,
      "step": 3450
    },
    {
      "epoch": 9.537150647580095,
      "grad_norm": 1.3749561309814453,
      "learning_rate": 0.00019275047873167374,
      "loss": 1.260106658935547,
      "step": 3500
    },
    {
      "epoch": 9.67348329925017,
      "grad_norm": 1.4010766744613647,
      "learning_rate": 0.0001843836762070014,
      "loss": 1.239128646850586,
      "step": 3550
    },
    {
      "epoch": 9.809815950920246,
      "grad_norm": 1.5308102369308472,
      "learning_rate": 0.00017612119882015126,
      "loss": 1.1977056121826173,
      "step": 3600
    },
    {
      "epoch": 9.94614860259032,
      "grad_norm": 1.3873751163482666,
      "learning_rate": 0.00016797050212812275,
      "loss": 1.1842040252685546,
      "step": 3650
    },
    {
      "epoch": 10.081799591002046,
      "grad_norm": 1.3666012287139893,
      "learning_rate": 0.00015993894082381616,
      "loss": 1.1095658111572266,
      "step": 3700
    },
    {
      "epoch": 10.21813224267212,
      "grad_norm": 1.3528972864151,
      "learning_rate": 0.00015203376209960474,
      "loss": 1.103120346069336,
      "step": 3750
    },
    {
      "epoch": 10.354464894342195,
      "grad_norm": 1.3081281185150146,
      "learning_rate": 0.00014426209910790887,
      "loss": 1.0691104125976563,
      "step": 3800
    },
    {
      "epoch": 10.49079754601227,
      "grad_norm": 1.3515572547912598,
      "learning_rate": 0.00013663096452467343,
      "loss": 1.0644143676757813,
      "step": 3850
    },
    {
      "epoch": 10.627130197682344,
      "grad_norm": 1.2935131788253784,
      "learning_rate": 0.00012914724422155598,
      "loss": 1.0334495544433593,
      "step": 3900
    },
    {
      "epoch": 10.76346284935242,
      "grad_norm": 1.3209459781646729,
      "learning_rate": 0.00012181769105253435,
      "loss": 1.0103805541992188,
      "step": 3950
    },
    {
      "epoch": 10.899795501022496,
      "grad_norm": 1.324385643005371,
      "learning_rate": 0.00011464891876054252,
      "loss": 0.990460433959961,
      "step": 4000
    },
    {
      "epoch": 11.03544648943422,
      "grad_norm": 1.374879002571106,
      "learning_rate": 0.00010764739600963116,
      "loss": 0.9643755340576172,
      "step": 4050
    },
    {
      "epoch": 11.171779141104295,
      "grad_norm": 1.275993824005127,
      "learning_rate": 0.00010081944054803842,
      "loss": 0.936119155883789,
      "step": 4100
    },
    {
      "epoch": 11.308111792774369,
      "grad_norm": 1.2590258121490479,
      "learning_rate": 9.417121350743844e-05,
      "loss": 0.9281440734863281,
      "step": 4150
    },
    {
      "epoch": 11.444444444444445,
      "grad_norm": 1.2088381052017212,
      "learning_rate": 8.770871384351085e-05,
      "loss": 0.9070972442626953,
      "step": 4200
    }
  ],
  "logging_steps": 50,
  "max_steps": 5505,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 15,
  "save_steps": 200,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 9.989600444122399e+17,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}