Training in progress, step 40000, checkpoint

Browse files

Files changed (8) hide show

last-checkpoint/model.safetensors +1 -1
last-checkpoint/optimizer.pt +1 -1
last-checkpoint/rng_state.pth +1 -1
last-checkpoint/scaler.pt +1 -1
last-checkpoint/scheduler.pt +1 -1
last-checkpoint/tokenizer.json +1 -1
last-checkpoint/trainer_state.json +277 -589
last-checkpoint/training_args.bin +1 -1

last-checkpoint/model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:5615f5c89d013fb49a779606b4469fbfcb42b508a247fb0de81f625f228ffd29
 size 641630264

 version https://git-lfs.github.com/spec/v1
+oid sha256:195846c2c0d0878f568436856fcb31115980b919fbde547b24fd8f1b6904de93
 size 641630264

last-checkpoint/optimizer.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:58d8d2bf92817390f11f67b44f65ad4a6149f1dc2868ba468dfdda581838aee0
 size 1283324282

 version https://git-lfs.github.com/spec/v1
+oid sha256:569405fc74cee783b05c60cf3d0725086154f7c45fc14f63f78350b6f91fba50
 size 1283324282

last-checkpoint/rng_state.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:496c96b90a901bbe63b7d97d586df49c4bac3dd99421ba33e49cfafd2c9c454f
 size 14244

 version https://git-lfs.github.com/spec/v1
+oid sha256:cbe7fbcb6a233a005419711cf60f8760e1079db7e2f205f22b817a7876d6841c
 size 14244

last-checkpoint/scaler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:be749ce6c2e646403dd9eb54cb2041d931d8fa4ed3faa66f55822cd781662848
 size 988

 version https://git-lfs.github.com/spec/v1
+oid sha256:996e482d7113d1552c854585970963491dff89aace5a33b5bc82a19f32014cec
 size 988

last-checkpoint/scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f09268d48053182ee7910b62b4a5ba89f63f97f1266f9d30b5fbad29e0d8898a
 size 1064

 version https://git-lfs.github.com/spec/v1
+oid sha256:433bb109a71cc77a49d19d0f78be4460568488238c75b6770695e9c96428597d
 size 1064

last-checkpoint/tokenizer.json CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:3326df9ec64ce5af70e1eff5ba7070351a3c08ddec98cac0c7b843f58fad66ec
 size 10959617

 version https://git-lfs.github.com/spec/v1
+oid sha256:4d95d07520d0d46f70d42b3d0908ce844501ca2d13d2b479131e76473cc6b3bf
 size 10959617

last-checkpoint/trainer_state.json CHANGED Viewed

@@ -2,954 +2,642 @@
   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 1.4898688915375446,
   "eval_steps": 5000,
-  "global_step": 60000,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
       "epoch": 0.012415574096146206,
-      "grad_norm": 2.5686280727386475,
-      "learning_rate": 0.00025,
-      "loss": 7.7629,
       "step": 500
     },
     {
       "epoch": 0.024831148192292412,
-      "grad_norm": 2.2654430866241455,
-      "learning_rate": 0.0005,
-      "loss": 6.0856,
       "step": 1000
     },
     {
       "epoch": 0.037246722288438616,
-      "grad_norm": 2.7833986282348633,
-      "learning_rate": 0.0004987522459572769,
-      "loss": 5.6864,
       "step": 1500
     },
     {
       "epoch": 0.049662296384584824,
-      "grad_norm": 2.7468013763427734,
-      "learning_rate": 0.0004975044919145539,
-      "loss": 5.3895,
       "step": 2000
     },
     {
       "epoch": 0.06207787048073103,
-      "grad_norm": 2.8749406337738037,
-      "learning_rate": 0.0004962567378718307,
-      "loss": 5.186,
       "step": 2500
     },
     {
       "epoch": 0.07449344457687723,
-      "grad_norm": 2.867882490158081,
-      "learning_rate": 0.0004950089838291076,
-      "loss": 4.977,
       "step": 3000
     },
     {
       "epoch": 0.08690901867302345,
-      "grad_norm": 3.050034523010254,
-      "learning_rate": 0.0004937612297863845,
-      "loss": 4.7687,
       "step": 3500
     },
     {
       "epoch": 0.09932459276916965,
-      "grad_norm": 4.819639682769775,
-      "learning_rate": 0.0004925134757436613,
-      "loss": 4.4337,
       "step": 4000
     },
     {
       "epoch": 0.11174016686531585,
-      "grad_norm": 2.481872081756592,
-      "learning_rate": 0.0004912657217009383,
-      "loss": 4.0712,
       "step": 4500
     },
     {
       "epoch": 0.12415574096146206,
-      "grad_norm": 2.837972402572632,
-      "learning_rate": 0.0004900179676582152,
-      "loss": 3.915,
       "step": 5000
     },
     {
       "epoch": 0.12415574096146206,
-      "eval_loss": 2.701279878616333,
-      "eval_runtime": 3666.7471,
-      "eval_samples_per_second": 351.453,
-      "eval_steps_per_second": 10.983,
       "step": 5000
     },
     {
       "epoch": 0.13657131505760825,
-      "grad_norm": 3.349369525909424,
-      "learning_rate": 0.0004887702136154921,
-      "loss": 3.7757,
       "step": 5500
     },
     {
       "epoch": 0.14898688915375446,
-      "grad_norm": 3.129056930541992,
-      "learning_rate": 0.00048752245957276905,
-      "loss": 3.7123,
       "step": 6000
     },
     {
       "epoch": 0.16140246324990068,
-      "grad_norm": 3.715869426727295,
-      "learning_rate": 0.00048627969654621686,
-      "loss": 3.6079,
       "step": 6500
     },
     {
       "epoch": 0.1738180373460469,
-      "grad_norm": 2.6967413425445557,
-      "learning_rate": 0.0004850319425034937,
-      "loss": 3.5496,
       "step": 7000
     },
     {
       "epoch": 0.18623361144219308,
-      "grad_norm": 2.7264204025268555,
-      "learning_rate": 0.0004837841884607706,
-      "loss": 3.431,
       "step": 7500
     },
     {
       "epoch": 0.1986491855383393,
-      "grad_norm": 2.5806965827941895,
-      "learning_rate": 0.00048253643441804753,
-      "loss": 3.384,
       "step": 8000
     },
     {
       "epoch": 0.2110647596344855,
-      "grad_norm": 3.236356019973755,
-      "learning_rate": 0.0004812886803753244,
-      "loss": 3.3063,
       "step": 8500
     },
     {
       "epoch": 0.2234803337306317,
-      "grad_norm": 3.126569986343384,
-      "learning_rate": 0.00048004342184068677,
-      "loss": 3.2365,
       "step": 9000
     },
     {
       "epoch": 0.2358959078267779,
-      "grad_norm": 2.903918981552124,
-      "learning_rate": 0.0004787956677979637,
-      "loss": 3.1995,
       "step": 9500
     },
     {
       "epoch": 0.24831148192292413,
-      "grad_norm": 2.595036745071411,
-      "learning_rate": 0.00047754791375524057,
-      "loss": 3.1525,
       "step": 10000
     },
     {
       "epoch": 0.24831148192292413,
-      "eval_loss": 2.1151881217956543,
-      "eval_runtime": 3661.5842,
-      "eval_samples_per_second": 351.949,
-      "eval_steps_per_second": 10.999,
       "step": 10000
     },
     {
       "epoch": 0.2607270560190703,
-      "grad_norm": 2.7406485080718994,
-      "learning_rate": 0.0004763001597125175,
-      "loss": 3.1375,
       "step": 10500
     },
     {
       "epoch": 0.2731426301152165,
-      "grad_norm": 3.198718309402466,
-      "learning_rate": 0.0004750549011778798,
-      "loss": 3.0598,
       "step": 11000
     },
     {
       "epoch": 0.28555820421136274,
-      "grad_norm": 2.6438426971435547,
-      "learning_rate": 0.00047380714713515673,
-      "loss": 3.0657,
       "step": 11500
     },
     {
       "epoch": 0.29797377830750893,
-      "grad_norm": 3.145714282989502,
-      "learning_rate": 0.0004725593930924336,
-      "loss": 3.003,
       "step": 12000
     },
     {
       "epoch": 0.31038935240365517,
-      "grad_norm": 3.4619131088256836,
-      "learning_rate": 0.00047131413455779597,
-      "loss": 2.9664,
       "step": 12500
     },
     {
       "epoch": 0.32280492649980136,
-      "grad_norm": 2.8198635578155518,
-      "learning_rate": 0.0004700663805150729,
-      "loss": 2.9595,
       "step": 13000
     },
     {
       "epoch": 0.33522050059594755,
-      "grad_norm": 3.9205424785614014,
-      "learning_rate": 0.0004688186264723498,
-      "loss": 2.9262,
       "step": 13500
     },
     {
       "epoch": 0.3476360746920938,
-      "grad_norm": 3.130042791366577,
-      "learning_rate": 0.0004675708724296267,
-      "loss": 2.8829,
       "step": 14000
     },
     {
       "epoch": 0.36005164878824,
-      "grad_norm": 3.2414395809173584,
-      "learning_rate": 0.0004663231183869036,
-      "loss": 2.8598,
       "step": 14500
     },
     {
       "epoch": 0.37246722288438616,
-      "grad_norm": 3.659555196762085,
-      "learning_rate": 0.00046507536434418044,
-      "loss": 2.8592,
       "step": 15000
     },
     {
       "epoch": 0.37246722288438616,
-      "eval_loss": 1.8211588859558105,
-      "eval_runtime": 3647.4211,
-      "eval_samples_per_second": 353.316,
-      "eval_steps_per_second": 11.041,
       "step": 15000
     },
     {
       "epoch": 0.3848827969805324,
-      "grad_norm": 3.733859062194824,
-      "learning_rate": 0.0004638301058095428,
-      "loss": 2.8209,
       "step": 15500
     },
     {
       "epoch": 0.3972983710766786,
-      "grad_norm": 2.9978246688842773,
-      "learning_rate": 0.00046258235176681973,
-      "loss": 2.7854,
       "step": 16000
     },
     {
       "epoch": 0.4097139451728248,
-      "grad_norm": 2.694765567779541,
-      "learning_rate": 0.00046133459772409666,
-      "loss": 2.789,
       "step": 16500
     },
     {
       "epoch": 0.422129519268971,
-      "grad_norm": 3.022148370742798,
-      "learning_rate": 0.00046008684368137353,
-      "loss": 2.7819,
       "step": 17000
     },
     {
       "epoch": 0.4345450933651172,
-      "grad_norm": 2.756038188934326,
-      "learning_rate": 0.00045883908963865045,
-      "loss": 2.7496,
       "step": 17500
     },
     {
       "epoch": 0.4469606674612634,
-      "grad_norm": 3.0430989265441895,
-      "learning_rate": 0.0004575913355959274,
-      "loss": 2.7558,
       "step": 18000
     },
     {
       "epoch": 0.45937624155740964,
-      "grad_norm": 2.710583209991455,
-      "learning_rate": 0.0004563460770612897,
-      "loss": 2.7214,
       "step": 18500
     },
     {
       "epoch": 0.4717918156535558,
-      "grad_norm": 4.814529895782471,
-      "learning_rate": 0.00045509832301856656,
-      "loss": 2.6977,
       "step": 19000
     },
     {
       "epoch": 0.484207389749702,
-      "grad_norm": 2.782024621963501,
-      "learning_rate": 0.0004538505689758435,
-      "loss": 2.7066,
       "step": 19500
     },
     {
       "epoch": 0.49662296384584825,
-      "grad_norm": 2.9942479133605957,
-      "learning_rate": 0.00045260281493312036,
-      "loss": 2.6744,
       "step": 20000
     },
     {
       "epoch": 0.49662296384584825,
-      "eval_loss": 1.681386947631836,
-      "eval_runtime": 3650.0754,
-      "eval_samples_per_second": 353.059,
-      "eval_steps_per_second": 11.033,
       "step": 20000
     },
     {
       "epoch": 0.5090385379419944,
-      "grad_norm": 2.6107559204101562,
-      "learning_rate": 0.00045135755639848273,
-      "loss": 2.6627,
       "step": 20500
     },
     {
       "epoch": 0.5214541120381406,
-      "grad_norm": 3.603623390197754,
-      "learning_rate": 0.00045010980235575966,
-      "loss": 2.6374,
       "step": 21000
     },
     {
       "epoch": 0.5338696861342869,
-      "grad_norm": 2.804776668548584,
-      "learning_rate": 0.0004488620483130366,
-      "loss": 2.6477,
       "step": 21500
     },
     {
       "epoch": 0.546285260230433,
-      "grad_norm": 3.2368860244750977,
-      "learning_rate": 0.00044761429427031345,
-      "loss": 2.6416,
       "step": 22000
     },
     {
       "epoch": 0.5587008343265792,
-      "grad_norm": 2.6095378398895264,
-      "learning_rate": 0.0004463665402275903,
-      "loss": 2.6248,
       "step": 22500
     },
     {
       "epoch": 0.5711164084227255,
-      "grad_norm": 2.9860754013061523,
-      "learning_rate": 0.0004451212816929527,
-      "loss": 2.6254,
       "step": 23000
     },
     {
       "epoch": 0.5835319825188717,
-      "grad_norm": 3.114459276199341,
-      "learning_rate": 0.0004438735276502296,
-      "loss": 2.6257,
       "step": 23500
     },
     {
       "epoch": 0.5959475566150179,
-      "grad_norm": 2.812556028366089,
-      "learning_rate": 0.0004426257736075065,
-      "loss": 2.5746,
       "step": 24000
     },
     {
       "epoch": 0.6083631307111641,
-      "grad_norm": 3.2355823516845703,
-      "learning_rate": 0.0004413780195647834,
-      "loss": 2.5919,
       "step": 24500
     },
     {
       "epoch": 0.6207787048073103,
-      "grad_norm": 2.5201354026794434,
-      "learning_rate": 0.0004401302655220603,
-      "loss": 2.5754,
       "step": 25000
     },
     {
       "epoch": 0.6207787048073103,
-      "eval_loss": 1.5900601148605347,
-      "eval_runtime": 4174.4112,
-      "eval_samples_per_second": 308.712,
-      "eval_steps_per_second": 9.647,
       "step": 25000
     },
     {
       "epoch": 0.6331942789034565,
-      "grad_norm": 2.8540596961975098,
-      "learning_rate": 0.0004388825114793372,
-      "loss": 2.5705,
       "step": 25500
     },
     {
       "epoch": 0.6456098529996027,
-      "grad_norm": 2.603358507156372,
-      "learning_rate": 0.00043763475743661414,
-      "loss": 2.5342,
       "step": 26000
     },
     {
       "epoch": 0.658025427095749,
-      "grad_norm": 2.7852208614349365,
-      "learning_rate": 0.00043638700339389096,
-      "loss": 2.5463,
       "step": 26500
     },
     {
       "epoch": 0.6704410011918951,
-      "grad_norm": 2.7578940391540527,
-      "learning_rate": 0.0004351392493511679,
-      "loss": 2.5372,
       "step": 27000
     },
     {
       "epoch": 0.6828565752880413,
-      "grad_norm": 2.941049337387085,
-      "learning_rate": 0.00043389399081653025,
-      "loss": 2.5207,
       "step": 27500
     },
     {
       "epoch": 0.6952721493841876,
-      "grad_norm": 2.7455787658691406,
-      "learning_rate": 0.0004326462367738072,
-      "loss": 2.5233,
       "step": 28000
     },
     {
       "epoch": 0.7076877234803337,
-      "grad_norm": 2.4482600688934326,
-      "learning_rate": 0.00043139848273108405,
-      "loss": 2.5105,
       "step": 28500
     },
     {
       "epoch": 0.72010329757648,
-      "grad_norm": 2.8398752212524414,
-      "learning_rate": 0.000430150728688361,
-      "loss": 2.531,
       "step": 29000
     },
     {
       "epoch": 0.7325188716726262,
-      "grad_norm": 2.608999013900757,
-      "learning_rate": 0.00042890547015372334,
-      "loss": 2.4864,
       "step": 29500
     },
     {
       "epoch": 0.7449344457687723,
-      "grad_norm": 2.071620225906372,
-      "learning_rate": 0.00042765771611100016,
-      "loss": 2.4574,
       "step": 30000
     },
     {
       "epoch": 0.7449344457687723,
-      "eval_loss": 1.5331941843032837,
-      "eval_runtime": 4179.1286,
-      "eval_samples_per_second": 308.364,
-      "eval_steps_per_second": 9.636,
       "step": 30000
     },
     {
       "epoch": 0.7573500198649186,
-      "grad_norm": 3.0172479152679443,
-      "learning_rate": 0.0004264099620682771,
-      "loss": 2.4733,
       "step": 30500
     },
     {
       "epoch": 0.7697655939610648,
-      "grad_norm": 2.6325442790985107,
-      "learning_rate": 0.000425162208025554,
-      "loss": 2.4721,
       "step": 31000
     },
     {
       "epoch": 0.7821811680572109,
-      "grad_norm": 2.826345682144165,
-      "learning_rate": 0.0004239144539828309,
-      "loss": 2.4692,
       "step": 31500
     },
     {
       "epoch": 0.7945967421533572,
-      "grad_norm": 2.456289291381836,
-      "learning_rate": 0.00042266919544819325,
-      "loss": 2.4385,
       "step": 32000
     },
     {
       "epoch": 0.8070123162495034,
-      "grad_norm": 2.4803292751312256,
-      "learning_rate": 0.0004214214414054702,
-      "loss": 2.439,
       "step": 32500
     },
     {
       "epoch": 0.8194278903456496,
-      "grad_norm": 2.6469247341156006,
-      "learning_rate": 0.00042017618287083254,
-      "loss": 2.4729,
       "step": 33000
     },
     {
       "epoch": 0.8318434644417958,
-      "grad_norm": 2.7024786472320557,
-      "learning_rate": 0.0004189284288281094,
-      "loss": 2.4244,
       "step": 33500
     },
     {
       "epoch": 0.844259038537942,
-      "grad_norm": 2.847285270690918,
-      "learning_rate": 0.0004176806747853863,
-      "loss": 2.4636,
       "step": 34000
     },
     {
       "epoch": 0.8566746126340882,
-      "grad_norm": 2.453200340270996,
-      "learning_rate": 0.0004164329207426632,
-      "loss": 2.4524,
       "step": 34500
     },
     {
       "epoch": 0.8690901867302344,
-      "grad_norm": 2.49642276763916,
-      "learning_rate": 0.0004151876622080256,
-      "loss": 2.4457,
       "step": 35000
     },
     {
       "epoch": 0.8690901867302344,
-      "eval_loss": 1.4776599407196045,
-      "eval_runtime": 4154.52,
-      "eval_samples_per_second": 310.19,
-      "eval_steps_per_second": 9.694,
       "step": 35000
     },
     {
       "epoch": 0.8815057608263807,
-      "grad_norm": 2.576984405517578,
-      "learning_rate": 0.00041393990816530245,
-      "loss": 2.4149,
       "step": 35500
     },
     {
       "epoch": 0.8939213349225268,
-      "grad_norm": 3.0729165077209473,
-      "learning_rate": 0.0004126921541225794,
-      "loss": 2.4067,
       "step": 36000
     },
     {
       "epoch": 0.906336909018673,
-      "grad_norm": 2.7619829177856445,
-      "learning_rate": 0.0004114444000798563,
-      "loss": 2.4121,
       "step": 36500
     },
     {
       "epoch": 0.9187524831148193,
-      "grad_norm": 3.5316452980041504,
-      "learning_rate": 0.0004101991415452186,
-      "loss": 2.3781,
       "step": 37000
     },
     {
       "epoch": 0.9311680572109654,
-      "grad_norm": 2.7174599170684814,
-      "learning_rate": 0.0004089538830105809,
-      "loss": 2.4013,
       "step": 37500
     },
     {
       "epoch": 0.9435836313071116,
-      "grad_norm": 13.372625350952148,
-      "learning_rate": 0.00040771611100019967,
-      "loss": 2.5449,
       "step": 38000
     },
     {
       "epoch": 0.9559992054032579,
-      "grad_norm": 11.173745155334473,
-      "learning_rate": 0.0004064883210221601,
-      "loss": 6.7867,
       "step": 38500
     },
     {
       "epoch": 0.968414779499404,
-      "grad_norm": 0.9608703255653381,
-      "learning_rate": 0.000405240566979437,
-      "loss": 7.7097,
       "step": 39000
     },
     {
       "epoch": 0.9808303535955503,
-      "grad_norm": 8286.201171875,
-      "learning_rate": 0.0004039928129367139,
-      "loss": 7.7291,
       "step": 39500
     },
     {
       "epoch": 0.9932459276916965,
-      "grad_norm": 2.1965973377227783,
-      "learning_rate": 0.00040274505889399085,
-      "loss": 7.7205,
       "step": 40000
     },
     {
       "epoch": 0.9932459276916965,
-      "eval_loss": 7.322892665863037,
-      "eval_runtime": 4170.5156,
-      "eval_samples_per_second": 309.0,
-      "eval_steps_per_second": 9.656,
       "step": 40000
-    },
-    {
-      "epoch": 1.0056615017878427,
-      "grad_norm": 4.976968288421631,
-      "learning_rate": 0.0004014973048512677,
-      "loss": 7.671,
-      "step": 40500
-    },
-    {
-      "epoch": 1.0180770758839888,
-      "grad_norm": 430.2960510253906,
-      "learning_rate": 0.00040024955080854464,
-      "loss": 7.6052,
-      "step": 41000
-    },
-    {
-      "epoch": 1.030492649980135,
-      "grad_norm": 76.06390380859375,
-      "learning_rate": 0.00039900179676582157,
-      "loss": 7.6507,
-      "step": 41500
-    },
-    {
-      "epoch": 1.0429082240762813,
-      "grad_norm": 75.03479766845703,
-      "learning_rate": 0.00039776152924735477,
-      "loss": 7.6393,
-      "step": 42000
-    },
-    {
-      "epoch": 1.0553237981724275,
-      "grad_norm": 99.86361694335938,
-      "learning_rate": 0.00039651876622080257,
-      "loss": 7.6949,
-      "step": 42500
-    },
-    {
-      "epoch": 1.0677393722685737,
-      "grad_norm": 1295.1463623046875,
-      "learning_rate": 0.0003952710121780795,
-      "loss": 8.0558,
-      "step": 43000
-    },
-    {
-      "epoch": 1.08015494636472,
-      "grad_norm": 3921.91357421875,
-      "learning_rate": 0.00039402325813535637,
-      "loss": 8.1452,
-      "step": 43500
-    },
-    {
-      "epoch": 1.0925705204608662,
-      "grad_norm": 2463.47265625,
-      "learning_rate": 0.0003927755040926333,
-      "loss": 8.6431,
-      "step": 44000
-    },
-    {
-      "epoch": 1.1049860945570122,
-      "grad_norm": 259.3988952636719,
-      "learning_rate": 0.0003915302455579956,
-      "loss": 8.3183,
-      "step": 44500
-    },
-    {
-      "epoch": 1.1174016686531585,
-      "grad_norm": 1.8363823890686035,
-      "learning_rate": 0.00039028249151527254,
-      "loss": 8.5829,
-      "step": 45000
-    },
-    {
-      "epoch": 1.1174016686531585,
-      "eval_loss": 8.133321762084961,
-      "eval_runtime": 4160.7924,
-      "eval_samples_per_second": 309.722,
-      "eval_steps_per_second": 9.679,
-      "step": 45000
-    },
-    {
-      "epoch": 1.1298172427493047,
-      "grad_norm": 18.962547302246094,
-      "learning_rate": 0.0003890347374725494,
-      "loss": 8.762,
-      "step": 45500
-    },
-    {
-      "epoch": 1.142232816845451,
-      "grad_norm": 41.705020904541016,
-      "learning_rate": 0.00038778698342982633,
-      "loss": 8.8213,
-      "step": 46000
-    },
-    {
-      "epoch": 1.1546483909415972,
-      "grad_norm": 17.964086532592773,
-      "learning_rate": 0.0003865392293871032,
-      "loss": 8.9416,
-      "step": 46500
-    },
-    {
-      "epoch": 1.1670639650377432,
-      "grad_norm": 363.1439208984375,
-      "learning_rate": 0.00038529147534438013,
-      "loss": 8.9453,
-      "step": 47000
-    },
-    {
-      "epoch": 1.1794795391338895,
-      "grad_norm": 5445.716796875,
-      "learning_rate": 0.0003840462168097425,
-      "loss": 8.9454,
-      "step": 47500
-    },
-    {
-      "epoch": 1.1918951132300357,
-      "grad_norm": 6571.228515625,
-      "learning_rate": 0.00038279846276701937,
-      "loss": 8.9851,
-      "step": 48000
-    },
-    {
-      "epoch": 1.204310687326182,
-      "grad_norm": 4.845749378204346,
-      "learning_rate": 0.00038155070872429624,
-      "loss": 9.1135,
-      "step": 48500
-    },
-    {
-      "epoch": 1.2167262614223282,
-      "grad_norm": 5.5228142738342285,
-      "learning_rate": 0.00038030295468157317,
-      "loss": 9.5186,
-      "step": 49000
-    },
-    {
-      "epoch": 1.2291418355184744,
-      "grad_norm": 3.5042662620544434,
-      "learning_rate": 0.0003790552006388501,
-      "loss": 9.8645,
-      "step": 49500
-    },
-    {
-      "epoch": 1.2415574096146207,
-      "grad_norm": 15.4348726272583,
-      "learning_rate": 0.00037780744659612696,
-      "loss": 9.7882,
-      "step": 50000
-    },
-    {
-      "epoch": 1.2415574096146207,
-      "eval_loss": 9.363153457641602,
-      "eval_runtime": 4159.7498,
-      "eval_samples_per_second": 309.8,
-      "eval_steps_per_second": 9.681,
-      "step": 50000
-    },
-    {
-      "epoch": 1.2539729837107667,
-      "grad_norm": 4.296143054962158,
-      "learning_rate": 0.0003765596925534039,
-      "loss": 9.7354,
-      "step": 50500
-    },
-    {
-      "epoch": 1.266388557806913,
-      "grad_norm": 4.641263484954834,
-      "learning_rate": 0.0003753119385106808,
-      "loss": 9.7796,
-      "step": 51000
-    },
-    {
-      "epoch": 1.2788041319030592,
-      "grad_norm": 4.560667514801025,
-      "learning_rate": 0.0003740641844679577,
-      "loss": 9.8683,
-      "step": 51500
-    },
-    {
-      "epoch": 1.2912197059992054,
-      "grad_norm": 4.787716388702393,
-      "learning_rate": 0.0003728164304252346,
-      "loss": 9.7906,
-      "step": 52000
-    },
-    {
-      "epoch": 1.3036352800953517,
-      "grad_norm": 4.268510818481445,
-      "learning_rate": 0.0003715686763825115,
-      "loss": 9.8016,
-      "step": 52500
-    },
-    {
-      "epoch": 1.3160508541914977,
-      "grad_norm": 4.434477806091309,
-      "learning_rate": 0.00037032092233978836,
-      "loss": 9.7858,
-      "step": 53000
-    },
-    {
-      "epoch": 1.328466428287644,
-      "grad_norm": 25.949443817138672,
-      "learning_rate": 0.0003690731682970653,
-      "loss": 9.7222,
-      "step": 53500
-    },
-    {
-      "epoch": 1.3408820023837902,
-      "grad_norm": 61.10898208618164,
-      "learning_rate": 0.0003678254142543422,
-      "loss": 9.8307,
-      "step": 54000
-    },
-    {
-      "epoch": 1.3532975764799364,
-      "grad_norm": 7.8839430809021,
-      "learning_rate": 0.0003665776602116191,
-      "loss": 9.6664,
-      "step": 54500
-    },
-    {
-      "epoch": 1.3657131505760827,
-      "grad_norm": 27.914098739624023,
-      "learning_rate": 0.000365329906168896,
-      "loss": 9.5602,
-      "step": 55000
-    },
-    {
-      "epoch": 1.3657131505760827,
-      "eval_loss": 9.656608581542969,
-      "eval_runtime": 4188.0736,
-      "eval_samples_per_second": 307.705,
-      "eval_steps_per_second": 9.616,
-      "step": 55000
-    },
-    {
-      "epoch": 1.378128724672229,
-      "grad_norm": 259.0064392089844,
-      "learning_rate": 0.0003640821521261729,
-      "loss": 9.6525,
-      "step": 55500
-    },
-    {
-      "epoch": 1.3905442987683752,
-      "grad_norm": 7.888017654418945,
-      "learning_rate": 0.0003628343980834498,
-      "loss": 9.5104,
-      "step": 56000
-    },
-    {
-      "epoch": 1.4029598728645212,
-      "grad_norm": 7.025816440582275,
-      "learning_rate": 0.00036158664404072673,
-      "loss": 9.6077,
-      "step": 56500
-    },
-    {
-      "epoch": 1.4153754469606674,
-      "grad_norm": 41.72800827026367,
-      "learning_rate": 0.0003603388899980036,
-      "loss": 9.6558,
-      "step": 57000
-    },
-    {
-      "epoch": 1.4277910210568137,
-      "grad_norm": 289.7976989746094,
-      "learning_rate": 0.0003590911359552805,
-      "loss": 9.6123,
-      "step": 57500
-    },
-    {
-      "epoch": 1.44020659515296,
-      "grad_norm": 9.890583038330078,
-      "learning_rate": 0.0003578433819125574,
-      "loss": 9.4366,
-      "step": 58000
-    },
-    {
-      "epoch": 1.4526221692491061,
-      "grad_norm": 405.3219299316406,
-      "learning_rate": 0.00035659812337791977,
-      "loss": 9.3668,
-      "step": 58500
-    },
-    {
-      "epoch": 1.4650377433452522,
-      "grad_norm": 6.443118572235107,
-      "learning_rate": 0.00035535036933519664,
-      "loss": 9.4349,
-      "step": 59000
-    },
-    {
-      "epoch": 1.4774533174413986,
-      "grad_norm": 8.514484405517578,
-      "learning_rate": 0.00035410261529247357,
-      "loss": 9.3594,
-      "step": 59500
-    },
-    {
-      "epoch": 1.4898688915375446,
-      "grad_norm": 11.674294471740723,
-      "learning_rate": 0.0003528548612497505,
-      "loss": 9.3252,
-      "step": 60000
-    },
-    {
-      "epoch": 1.4898688915375446,
-      "eval_loss": 9.275012969970703,
-      "eval_runtime": 4173.4545,
-      "eval_samples_per_second": 308.783,
-      "eval_steps_per_second": 9.65,
-      "step": 60000
     }
   ],
   "logging_steps": 500,
-  "max_steps": 201360,
   "num_input_tokens_seen": 0,
-  "num_train_epochs": 5,
   "save_steps": 10000,
   "stateful_callbacks": {
     "TrainerControl": {
@@ -963,7 +651,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 2.5209002073613517e+17,
   "train_batch_size": 32,
   "trial_name": null,
   "trial_params": null

   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
+  "epoch": 0.9932459276916965,
   "eval_steps": 5000,
+  "global_step": 40000,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
       "epoch": 0.012415574096146206,
+      "grad_norm": 34.79914474487305,
+      "learning_rate": 0.000246,
+      "loss": 8.4873,
       "step": 500
     },
     {
       "epoch": 0.024831148192292412,
+      "grad_norm": 6.984533786773682,
+      "learning_rate": 0.0004955,
+      "loss": 8.152,
       "step": 1000
     },
     {
       "epoch": 0.037246722288438616,
+      "grad_norm": 18.275636672973633,
+      "learning_rate": 0.0004984664684423567,
+      "loss": 8.3503,
       "step": 1500
     },
     {
       "epoch": 0.049662296384584824,
+      "grad_norm": 8.112704277038574,
+      "learning_rate": 0.0004969141971915447,
+      "loss": 8.1588,
       "step": 2000
     },
     {
       "epoch": 0.06207787048073103,
+      "grad_norm": 371.90625,
+      "learning_rate": 0.0004953525560941483,
+      "loss": 7.9764,
       "step": 2500
     },
     {
       "epoch": 0.07449344457687723,
+      "grad_norm": 164.8943634033203,
+      "learning_rate": 0.0004937940382789466,
+      "loss": 7.8648,
       "step": 3000
     },
     {
       "epoch": 0.08690901867302345,
+      "grad_norm": 119.9703140258789,
+      "learning_rate": 0.0004922323971815501,
+      "loss": 8.0131,
       "step": 3500
     },
     {
       "epoch": 0.09932459276916965,
+      "grad_norm": 1504.8216552734375,
+      "learning_rate": 0.0004906738793663485,
+      "loss": 8.0822,
       "step": 4000
     },
     {
       "epoch": 0.11174016686531585,
+      "grad_norm": NaN,
+      "learning_rate": 0.0004900742091849483,
+      "loss": 10.2917,
       "step": 4500
     },
     {
       "epoch": 0.12415574096146206,
+      "grad_norm": NaN,
+      "learning_rate": 0.0004900742091849483,
+      "loss": 0.0,
       "step": 5000
     },
     {
       "epoch": 0.12415574096146206,
+      "eval_loss": NaN,
+      "eval_runtime": 3598.9752,
+      "eval_samples_per_second": 358.072,
+      "eval_steps_per_second": 11.19,
       "step": 5000
     },
     {
       "epoch": 0.13657131505760825,
+      "grad_norm": NaN,
+      "learning_rate": 0.0004900742091849483,
+      "loss": 0.0,
       "step": 5500
     },
     {
       "epoch": 0.14898688915375446,
+      "grad_norm": NaN,
+      "learning_rate": 0.0004900742091849483,
+      "loss": 0.0,
       "step": 6000
     },
     {
       "epoch": 0.16140246324990068,
+      "grad_norm": NaN,
+      "learning_rate": 0.0004900742091849483,
+      "loss": 0.0,
       "step": 6500
     },
     {
       "epoch": 0.1738180373460469,
+      "grad_norm": NaN,
+      "learning_rate": 0.0004900742091849483,
+      "loss": 0.0,
       "step": 7000
     },
     {
       "epoch": 0.18623361144219308,
+      "grad_norm": NaN,
+      "learning_rate": 0.0004900742091849483,
+      "loss": 0.0,
       "step": 7500
     },
     {
       "epoch": 0.1986491855383393,
+      "grad_norm": NaN,
+      "learning_rate": 0.0004900742091849483,
+      "loss": 0.0,
       "step": 8000
     },
     {
       "epoch": 0.2110647596344855,
+      "grad_norm": NaN,
+      "learning_rate": 0.0004900742091849483,
+      "loss": 0.0,
       "step": 8500
     },
     {
       "epoch": 0.2234803337306317,
+      "grad_norm": NaN,
+      "learning_rate": 0.0004900742091849483,
+      "loss": 0.0,
       "step": 9000
     },
     {
       "epoch": 0.2358959078267779,
+      "grad_norm": NaN,
+      "learning_rate": 0.0004900742091849483,
+      "loss": 0.0,
       "step": 9500
     },
     {
       "epoch": 0.24831148192292413,
+      "grad_norm": NaN,
+      "learning_rate": 0.0004900742091849483,
+      "loss": 0.0,
       "step": 10000
     },
     {
       "epoch": 0.24831148192292413,
+      "eval_loss": NaN,
+      "eval_runtime": 3702.896,
+      "eval_samples_per_second": 348.022,
+      "eval_steps_per_second": 10.876,
       "step": 10000
     },
     {
       "epoch": 0.2607270560190703,
+      "grad_norm": NaN,
+      "learning_rate": 0.0004900742091849483,
+      "loss": 0.0,
       "step": 10500
     },
     {
       "epoch": 0.2731426301152165,
+      "grad_norm": NaN,
+      "learning_rate": 0.0004900742091849483,
+      "loss": 0.0,
       "step": 11000
     },
     {
       "epoch": 0.28555820421136274,
+      "grad_norm": NaN,
+      "learning_rate": 0.0004900742091849483,
+      "loss": 0.0,
       "step": 11500
     },
     {
       "epoch": 0.29797377830750893,
+      "grad_norm": NaN,
+      "learning_rate": 0.0004900742091849483,
+      "loss": 0.0,
       "step": 12000
     },
     {
       "epoch": 0.31038935240365517,
+      "grad_norm": NaN,
+      "learning_rate": 0.0004900742091849483,
+      "loss": 0.0,
       "step": 12500
     },
     {
       "epoch": 0.32280492649980136,
+      "grad_norm": NaN,
+      "learning_rate": 0.0004900742091849483,
+      "loss": 0.0,
       "step": 13000
     },
     {
       "epoch": 0.33522050059594755,
+      "grad_norm": NaN,
+      "learning_rate": 0.0004900742091849483,
+      "loss": 0.0,
       "step": 13500
     },
     {
       "epoch": 0.3476360746920938,
+      "grad_norm": NaN,
+      "learning_rate": 0.0004900742091849483,
+      "loss": 0.0,
       "step": 14000
     },
     {
       "epoch": 0.36005164878824,
+      "grad_norm": NaN,
+      "learning_rate": 0.0004900742091849483,
+      "loss": 0.0,
       "step": 14500
     },
     {
       "epoch": 0.37246722288438616,
+      "grad_norm": NaN,
+      "learning_rate": 0.0004900742091849483,
+      "loss": 0.0,
       "step": 15000
     },
     {
       "epoch": 0.37246722288438616,
+      "eval_loss": NaN,
+      "eval_runtime": 3721.6303,
+      "eval_samples_per_second": 346.271,
+      "eval_steps_per_second": 10.821,
       "step": 15000
     },
     {
       "epoch": 0.3848827969805324,
+      "grad_norm": NaN,
+      "learning_rate": 0.0004900742091849483,
+      "loss": 0.0,
       "step": 15500
     },
     {
       "epoch": 0.3972983710766786,
+      "grad_norm": NaN,
+      "learning_rate": 0.0004900742091849483,
+      "loss": 0.0,
       "step": 16000
     },
     {
       "epoch": 0.4097139451728248,
+      "grad_norm": NaN,
+      "learning_rate": 0.0004900742091849483,
+      "loss": 0.0,
       "step": 16500
     },
     {
       "epoch": 0.422129519268971,
+      "grad_norm": NaN,
+      "learning_rate": 0.0004900742091849483,
+      "loss": 0.0,
       "step": 17000
     },
     {
       "epoch": 0.4345450933651172,
+      "grad_norm": NaN,
+      "learning_rate": 0.0004900742091849483,
+      "loss": 0.0,
       "step": 17500
     },
     {
       "epoch": 0.4469606674612634,
+      "grad_norm": NaN,
+      "learning_rate": 0.0004900742091849483,
+      "loss": 0.0,
       "step": 18000
     },
     {
       "epoch": 0.45937624155740964,
+      "grad_norm": NaN,
+      "learning_rate": 0.0004900742091849483,
+      "loss": 0.0,
       "step": 18500
     },
     {
       "epoch": 0.4717918156535558,
+      "grad_norm": NaN,
+      "learning_rate": 0.0004900742091849483,
+      "loss": 0.0,
       "step": 19000
     },
     {
       "epoch": 0.484207389749702,
+      "grad_norm": NaN,
+      "learning_rate": 0.0004900742091849483,
+      "loss": 0.0,
       "step": 19500
     },
     {
       "epoch": 0.49662296384584825,
+      "grad_norm": NaN,
+      "learning_rate": 0.0004900742091849483,
+      "loss": 0.0,
       "step": 20000
     },
     {
       "epoch": 0.49662296384584825,
+      "eval_loss": NaN,
+      "eval_runtime": 3334.043,
+      "eval_samples_per_second": 386.525,
+      "eval_steps_per_second": 12.079,
       "step": 20000
     },
     {
       "epoch": 0.5090385379419944,
+      "grad_norm": NaN,
+      "learning_rate": 0.0004900742091849483,
+      "loss": 0.0,
       "step": 20500
     },
     {
       "epoch": 0.5214541120381406,
+      "grad_norm": NaN,
+      "learning_rate": 0.0004900742091849483,
+      "loss": 0.0,
       "step": 21000
     },
     {
       "epoch": 0.5338696861342869,
+      "grad_norm": NaN,
+      "learning_rate": 0.0004900742091849483,
+      "loss": 0.0,
       "step": 21500
     },
     {
       "epoch": 0.546285260230433,
+      "grad_norm": NaN,
+      "learning_rate": 0.0004900742091849483,
+      "loss": 0.0,
       "step": 22000
     },
     {
       "epoch": 0.5587008343265792,
+      "grad_norm": NaN,
+      "learning_rate": 0.0004900742091849483,
+      "loss": 0.0,
       "step": 22500
     },
     {
       "epoch": 0.5711164084227255,
+      "grad_norm": NaN,
+      "learning_rate": 0.0004900742091849483,
+      "loss": 0.0,
       "step": 23000
     },
     {
       "epoch": 0.5835319825188717,
+      "grad_norm": NaN,
+      "learning_rate": 0.0004900742091849483,
+      "loss": 0.0,
       "step": 23500
     },
     {
       "epoch": 0.5959475566150179,
+      "grad_norm": NaN,
+      "learning_rate": 0.0004900742091849483,
+      "loss": 0.0,
       "step": 24000
     },
     {
       "epoch": 0.6083631307111641,
+      "grad_norm": NaN,
+      "learning_rate": 0.0004900742091849483,
+      "loss": 0.0,
       "step": 24500
     },
     {
       "epoch": 0.6207787048073103,
+      "grad_norm": NaN,
+      "learning_rate": 0.0004900742091849483,
+      "loss": 0.0,
       "step": 25000
     },
     {
       "epoch": 0.6207787048073103,
+      "eval_loss": NaN,
+      "eval_runtime": 3318.6898,
+      "eval_samples_per_second": 388.313,
+      "eval_steps_per_second": 12.135,
       "step": 25000
     },
     {
       "epoch": 0.6331942789034565,
+      "grad_norm": NaN,
+      "learning_rate": 0.0004900742091849483,
+      "loss": 0.0,
       "step": 25500
     },
     {
       "epoch": 0.6456098529996027,
+      "grad_norm": NaN,
+      "learning_rate": 0.0004900742091849483,
+      "loss": 0.0,
       "step": 26000
     },
     {
       "epoch": 0.658025427095749,
+      "grad_norm": NaN,
+      "learning_rate": 0.0004900742091849483,
+      "loss": 0.0,
       "step": 26500
     },
     {
       "epoch": 0.6704410011918951,
+      "grad_norm": NaN,
+      "learning_rate": 0.0004900742091849483,
+      "loss": 0.0,
       "step": 27000
     },
     {
       "epoch": 0.6828565752880413,
+      "grad_norm": NaN,
+      "learning_rate": 0.0004900742091849483,
+      "loss": 0.0,
       "step": 27500
     },
     {
       "epoch": 0.6952721493841876,
+      "grad_norm": NaN,
+      "learning_rate": 0.0004900742091849483,
+      "loss": 0.0,
       "step": 28000
     },
     {
       "epoch": 0.7076877234803337,
+      "grad_norm": NaN,
+      "learning_rate": 0.0004900742091849483,
+      "loss": 0.0,
       "step": 28500
     },
     {
       "epoch": 0.72010329757648,
+      "grad_norm": NaN,
+      "learning_rate": 0.0004900742091849483,
+      "loss": 0.0,
       "step": 29000
     },
     {
       "epoch": 0.7325188716726262,
+      "grad_norm": NaN,
+      "learning_rate": 0.0004900742091849483,
+      "loss": 0.0,
       "step": 29500
     },
     {
       "epoch": 0.7449344457687723,
+      "grad_norm": NaN,
+      "learning_rate": 0.0004900742091849483,
+      "loss": 0.0,
       "step": 30000
     },
     {
       "epoch": 0.7449344457687723,
+      "eval_loss": NaN,
+      "eval_runtime": 3329.5874,
+      "eval_samples_per_second": 387.042,
+      "eval_steps_per_second": 12.095,
       "step": 30000
     },
     {
       "epoch": 0.7573500198649186,
+      "grad_norm": NaN,
+      "learning_rate": 0.0004900742091849483,
+      "loss": 0.0,
       "step": 30500
     },
     {
       "epoch": 0.7697655939610648,
+      "grad_norm": NaN,
+      "learning_rate": 0.0004900742091849483,
+      "loss": 0.0,
       "step": 31000
     },
     {
       "epoch": 0.7821811680572109,
+      "grad_norm": NaN,
+      "learning_rate": 0.0004900742091849483,
+      "loss": 0.0,
       "step": 31500
     },
     {
       "epoch": 0.7945967421533572,
+      "grad_norm": NaN,
+      "learning_rate": 0.0004900742091849483,
+      "loss": 0.0,
       "step": 32000
     },
     {
       "epoch": 0.8070123162495034,
+      "grad_norm": NaN,
+      "learning_rate": 0.0004900742091849483,
+      "loss": 0.0,
       "step": 32500
     },
     {
       "epoch": 0.8194278903456496,
+      "grad_norm": NaN,
+      "learning_rate": 0.0004900742091849483,
+      "loss": 0.0,
       "step": 33000
     },
     {
       "epoch": 0.8318434644417958,
+      "grad_norm": NaN,
+      "learning_rate": 0.0004900742091849483,
+      "loss": 0.0,
       "step": 33500
     },
     {
       "epoch": 0.844259038537942,
+      "grad_norm": NaN,
+      "learning_rate": 0.0004900742091849483,
+      "loss": 0.0,
       "step": 34000
     },
     {
       "epoch": 0.8566746126340882,
+      "grad_norm": NaN,
+      "learning_rate": 0.0004900742091849483,
+      "loss": 0.0,
       "step": 34500
     },
     {
       "epoch": 0.8690901867302344,
+      "grad_norm": NaN,
+      "learning_rate": 0.0004900742091849483,
+      "loss": 0.0,
       "step": 35000
     },
     {
       "epoch": 0.8690901867302344,
+      "eval_loss": NaN,
+      "eval_runtime": 3340.4008,
+      "eval_samples_per_second": 385.789,
+      "eval_steps_per_second": 12.056,
       "step": 35000
     },
     {
       "epoch": 0.8815057608263807,
+      "grad_norm": NaN,
+      "learning_rate": 0.0004900742091849483,
+      "loss": 0.0,
       "step": 35500
     },
     {
       "epoch": 0.8939213349225268,
+      "grad_norm": NaN,
+      "learning_rate": 0.0004900742091849483,
+      "loss": 0.0,
       "step": 36000
     },
     {
       "epoch": 0.906336909018673,
+      "grad_norm": NaN,
+      "learning_rate": 0.0004900742091849483,
+      "loss": 0.0,
       "step": 36500
     },
     {
       "epoch": 0.9187524831148193,
+      "grad_norm": NaN,
+      "learning_rate": 0.0004900742091849483,
+      "loss": 0.0,
       "step": 37000
     },
     {
       "epoch": 0.9311680572109654,
+      "grad_norm": NaN,
+      "learning_rate": 0.0004900742091849483,
+      "loss": 0.0,
       "step": 37500
     },
     {
       "epoch": 0.9435836313071116,
+      "grad_norm": NaN,
+      "learning_rate": 0.0004900742091849483,
+      "loss": 0.0,
       "step": 38000
     },
     {
       "epoch": 0.9559992054032579,
+      "grad_norm": NaN,
+      "learning_rate": 0.0004900742091849483,
+      "loss": 0.0,
       "step": 38500
     },
     {
       "epoch": 0.968414779499404,
+      "grad_norm": NaN,
+      "learning_rate": 0.0004900742091849483,
+      "loss": 0.0,
       "step": 39000
     },
     {
       "epoch": 0.9808303535955503,
+      "grad_norm": NaN,
+      "learning_rate": 0.0004900742091849483,
+      "loss": 0.0,
       "step": 39500
     },
     {
       "epoch": 0.9932459276916965,
+      "grad_norm": NaN,
+      "learning_rate": 0.0004900742091849483,
+      "loss": 0.0,
       "step": 40000
     },
     {
       "epoch": 0.9932459276916965,
+      "eval_loss": NaN,
+      "eval_runtime": 3403.8564,
+      "eval_samples_per_second": 378.597,
+      "eval_steps_per_second": 11.831,
       "step": 40000
     }
   ],
   "logging_steps": 500,
+  "max_steps": 161088,
   "num_input_tokens_seen": 0,
+  "num_train_epochs": 4,
   "save_steps": 10000,
   "stateful_callbacks": {
     "TrainerControl": {
       "attributes": {}
     }
   },
+  "total_flos": 8.494745877441331e+16,
   "train_batch_size": 32,
   "trial_name": null,
   "trial_params": null

last-checkpoint/training_args.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:3113451d9893929f8ff3855bcb2647209eee528a14890b35d215742603e4dc5a
 size 5368

 version https://git-lfs.github.com/spec/v1
+oid sha256:f4801bd5b148520d0075cf5afe1e0c45a70e3939c841013d91a0d072e265ffbd
 size 5368