Training in progress, step 600, checkpoint

Browse files

Files changed (7) hide show

last-checkpoint/adapter_config.json +5 -5
last-checkpoint/adapter_model.safetensors +1 -1
last-checkpoint/optimizer.pt +1 -1
last-checkpoint/rng_state.pth +1 -1
last-checkpoint/scheduler.pt +1 -1
last-checkpoint/trainer_state.json +1413 -5
last-checkpoint/training_args.bin +1 -1

last-checkpoint/adapter_config.json CHANGED Viewed

@@ -20,13 +20,13 @@
   "rank_pattern": {},
   "revision": null,
   "target_modules": [
-    "o_proj",
     "q_proj",
-    "k_proj",
-    "down_proj",
-    "gate_proj",
     "v_proj",
-    "up_proj"
   ],
   "task_type": "CAUSAL_LM",
   "use_dora": false,

   "rank_pattern": {},
   "revision": null,
   "target_modules": [
     "q_proj",
     "v_proj",
+    "gate_proj",
+    "down_proj",
+    "o_proj",
+    "up_proj",
+    "k_proj"
   ],
   "task_type": "CAUSAL_LM",
   "use_dora": false,

last-checkpoint/adapter_model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:9961be420a9f5b1ece0bf8352ac72f0825cae7d693fa0e49096ffb7c6fd9d324
 size 639691872

 version https://git-lfs.github.com/spec/v1
+oid sha256:4b323ad4be24049867110298a5696d7a4d3bc1285b5d9abb776fd44bb62c4e7e
 size 639691872

last-checkpoint/optimizer.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e27e9ec9ac3798574a12ad5881eb724672736596feeec845aa325cf61868ceae
 size 1279647314

 version https://git-lfs.github.com/spec/v1
+oid sha256:5f743f776850133223224bc723df1ecc1783afc6f39100d552a3269e6ec930b9
 size 1279647314

last-checkpoint/rng_state.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:13af9b7c642e074997a9bdbae88ba56e792139bc8360f4873ff633fa70291205
 size 14244

 version https://git-lfs.github.com/spec/v1
+oid sha256:a3fb2209e718294a208c6cc709af7b887a1d8670e8f1a4bc2f4f906958b18e41
 size 14244

last-checkpoint/scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:6dd3cfeea20c315ac503882fc53b8b9eae9cdcaf6dd61efec7bfa78209d384fd
 size 1064

 version https://git-lfs.github.com/spec/v1
+oid sha256:337154a73245c7e602fb3be659a2a2f9a9857c7eb27089f66eef80ad815c4899
 size 1064

last-checkpoint/trainer_state.json CHANGED Viewed

@@ -1,9 +1,9 @@
 {
-  "best_metric": 1.6627388000488281,
-  "best_model_checkpoint": "miner_id_24/checkpoint-400",
-  "epoch": 0.2157715518694582,
   "eval_steps": 200,
-  "global_step": 400,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -2831,6 +2831,1414 @@
       "eval_samples_per_second": 2.12,
       "eval_steps_per_second": 2.12,
       "step": 400
     }
   ],
   "logging_steps": 1,
@@ -2859,7 +4267,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 7.146078851574006e+17,
   "train_batch_size": 1,
   "trial_name": null,
   "trial_params": null

 {
+  "best_metric": 1.5770864486694336,
+  "best_model_checkpoint": "miner_id_24/checkpoint-600",
+  "epoch": 0.3236573278041873,
   "eval_steps": 200,
+  "global_step": 600,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
       "eval_samples_per_second": 2.12,
       "eval_steps_per_second": 2.12,
       "step": 400
+    },
+    {
+      "epoch": 0.21631098074913185,
+      "grad_norm": 24.1074161529541,
+      "learning_rate": 0.00019850345310169155,
+      "loss": 37.3797,
+      "step": 401
+    },
+    {
+      "epoch": 0.2168504096288055,
+      "grad_norm": 62.604949951171875,
+      "learning_rate": 0.00019849351783439561,
+      "loss": 78.7953,
+      "step": 402
+    },
+    {
+      "epoch": 0.21738983850847915,
+      "grad_norm": 43.36476135253906,
+      "learning_rate": 0.0001984835499473072,
+      "loss": 82.645,
+      "step": 403
+    },
+    {
+      "epoch": 0.2179292673881528,
+      "grad_norm": 52.12046432495117,
+      "learning_rate": 0.0001984735494437275,
+      "loss": 87.0839,
+      "step": 404
+    },
+    {
+      "epoch": 0.21846869626782645,
+      "grad_norm": 34.333431243896484,
+      "learning_rate": 0.00019846351632696863,
+      "loss": 105.6289,
+      "step": 405
+    },
+    {
+      "epoch": 0.2190081251475001,
+      "grad_norm": 41.665771484375,
+      "learning_rate": 0.00019845345060035335,
+      "loss": 112.3874,
+      "step": 406
+    },
+    {
+      "epoch": 0.21954755402717374,
+      "grad_norm": 58.79914093017578,
+      "learning_rate": 0.00019844335226721537,
+      "loss": 114.2657,
+      "step": 407
+    },
+    {
+      "epoch": 0.22008698290684736,
+      "grad_norm": 52.85742950439453,
+      "learning_rate": 0.00019843322133089906,
+      "loss": 98.4778,
+      "step": 408
+    },
+    {
+      "epoch": 0.220626411786521,
+      "grad_norm": 53.792476654052734,
+      "learning_rate": 0.00019842305779475968,
+      "loss": 94.7811,
+      "step": 409
+    },
+    {
+      "epoch": 0.22116584066619466,
+      "grad_norm": 49.56667709350586,
+      "learning_rate": 0.0001984128616621633,
+      "loss": 92.4516,
+      "step": 410
+    },
+    {
+      "epoch": 0.2217052695458683,
+      "grad_norm": 38.96401596069336,
+      "learning_rate": 0.0001984026329364867,
+      "loss": 78.0561,
+      "step": 411
+    },
+    {
+      "epoch": 0.22224469842554195,
+      "grad_norm": 35.649200439453125,
+      "learning_rate": 0.00019839237162111757,
+      "loss": 66.0612,
+      "step": 412
+    },
+    {
+      "epoch": 0.2227841273052156,
+      "grad_norm": 22.54837989807129,
+      "learning_rate": 0.00019838207771945426,
+      "loss": 59.3091,
+      "step": 413
+    },
+    {
+      "epoch": 0.22332355618488925,
+      "grad_norm": 16.843589782714844,
+      "learning_rate": 0.00019837175123490596,
+      "loss": 62.8711,
+      "step": 414
+    },
+    {
+      "epoch": 0.2238629850645629,
+      "grad_norm": 18.909435272216797,
+      "learning_rate": 0.00019836139217089275,
+      "loss": 55.3784,
+      "step": 415
+    },
+    {
+      "epoch": 0.22440241394423655,
+      "grad_norm": 25.120887756347656,
+      "learning_rate": 0.0001983510005308454,
+      "loss": 51.9063,
+      "step": 416
+    },
+    {
+      "epoch": 0.2249418428239102,
+      "grad_norm": 30.78650665283203,
+      "learning_rate": 0.00019834057631820543,
+      "loss": 32.4726,
+      "step": 417
+    },
+    {
+      "epoch": 0.22548127170358384,
+      "grad_norm": 72.46208953857422,
+      "learning_rate": 0.00019833011953642525,
+      "loss": 44.1452,
+      "step": 418
+    },
+    {
+      "epoch": 0.2260207005832575,
+      "grad_norm": 45.94267654418945,
+      "learning_rate": 0.000198319630188968,
+      "loss": 50.9596,
+      "step": 419
+    },
+    {
+      "epoch": 0.2265601294629311,
+      "grad_norm": 47.52016067504883,
+      "learning_rate": 0.00019830910827930764,
+      "loss": 44.8286,
+      "step": 420
+    },
+    {
+      "epoch": 0.22709955834260476,
+      "grad_norm": 40.93891525268555,
+      "learning_rate": 0.00019829855381092886,
+      "loss": 56.7985,
+      "step": 421
+    },
+    {
+      "epoch": 0.2276389872222784,
+      "grad_norm": 36.567108154296875,
+      "learning_rate": 0.0001982879667873272,
+      "loss": 35.7161,
+      "step": 422
+    },
+    {
+      "epoch": 0.22817841610195205,
+      "grad_norm": 31.908977508544922,
+      "learning_rate": 0.0001982773472120089,
+      "loss": 42.8407,
+      "step": 423
+    },
+    {
+      "epoch": 0.2287178449816257,
+      "grad_norm": 37.47427749633789,
+      "learning_rate": 0.00019826669508849108,
+      "loss": 39.5264,
+      "step": 424
+    },
+    {
+      "epoch": 0.22925727386129935,
+      "grad_norm": 43.83090591430664,
+      "learning_rate": 0.00019825601042030156,
+      "loss": 48.5415,
+      "step": 425
+    },
+    {
+      "epoch": 0.229796702740973,
+      "grad_norm": 42.004425048828125,
+      "learning_rate": 0.00019824529321097893,
+      "loss": 39.4127,
+      "step": 426
+    },
+    {
+      "epoch": 0.23033613162064664,
+      "grad_norm": 38.282066345214844,
+      "learning_rate": 0.00019823454346407267,
+      "loss": 40.8499,
+      "step": 427
+    },
+    {
+      "epoch": 0.2308755605003203,
+      "grad_norm": 33.92627716064453,
+      "learning_rate": 0.0001982237611831429,
+      "loss": 35.4472,
+      "step": 428
+    },
+    {
+      "epoch": 0.23141498937999394,
+      "grad_norm": 53.361106872558594,
+      "learning_rate": 0.00019821294637176057,
+      "loss": 43.1921,
+      "step": 429
+    },
+    {
+      "epoch": 0.2319544182596676,
+      "grad_norm": 40.92842102050781,
+      "learning_rate": 0.00019820209903350744,
+      "loss": 36.5019,
+      "step": 430
+    },
+    {
+      "epoch": 0.2324938471393412,
+      "grad_norm": 35.71042251586914,
+      "learning_rate": 0.00019819121917197602,
+      "loss": 36.598,
+      "step": 431
+    },
+    {
+      "epoch": 0.23303327601901486,
+      "grad_norm": 35.10508728027344,
+      "learning_rate": 0.00019818030679076952,
+      "loss": 31.6675,
+      "step": 432
+    },
+    {
+      "epoch": 0.2335727048986885,
+      "grad_norm": 31.885364532470703,
+      "learning_rate": 0.00019816936189350206,
+      "loss": 34.3554,
+      "step": 433
+    },
+    {
+      "epoch": 0.23411213377836215,
+      "grad_norm": 42.998878479003906,
+      "learning_rate": 0.0001981583844837984,
+      "loss": 28.1099,
+      "step": 434
+    },
+    {
+      "epoch": 0.2346515626580358,
+      "grad_norm": 38.70567321777344,
+      "learning_rate": 0.00019814737456529412,
+      "loss": 42.3567,
+      "step": 435
+    },
+    {
+      "epoch": 0.23519099153770945,
+      "grad_norm": 34.43855285644531,
+      "learning_rate": 0.00019813633214163555,
+      "loss": 22.8285,
+      "step": 436
+    },
+    {
+      "epoch": 0.2357304204173831,
+      "grad_norm": 33.38055419921875,
+      "learning_rate": 0.00019812525721647986,
+      "loss": 36.1465,
+      "step": 437
+    },
+    {
+      "epoch": 0.23626984929705674,
+      "grad_norm": 42.98970413208008,
+      "learning_rate": 0.00019811414979349485,
+      "loss": 34.8416,
+      "step": 438
+    },
+    {
+      "epoch": 0.2368092781767304,
+      "grad_norm": 37.12187957763672,
+      "learning_rate": 0.0001981030098763592,
+      "loss": 34.276,
+      "step": 439
+    },
+    {
+      "epoch": 0.23734870705640404,
+      "grad_norm": 44.36403274536133,
+      "learning_rate": 0.00019809183746876232,
+      "loss": 30.3544,
+      "step": 440
+    },
+    {
+      "epoch": 0.2378881359360777,
+      "grad_norm": 46.281654357910156,
+      "learning_rate": 0.00019808063257440432,
+      "loss": 27.8803,
+      "step": 441
+    },
+    {
+      "epoch": 0.23842756481575134,
+      "grad_norm": 49.94664001464844,
+      "learning_rate": 0.00019806939519699613,
+      "loss": 31.0358,
+      "step": 442
+    },
+    {
+      "epoch": 0.23896699369542496,
+      "grad_norm": 42.308616638183594,
+      "learning_rate": 0.0001980581253402595,
+      "loss": 29.4053,
+      "step": 443
+    },
+    {
+      "epoch": 0.2395064225750986,
+      "grad_norm": 51.36742401123047,
+      "learning_rate": 0.00019804682300792674,
+      "loss": 31.0947,
+      "step": 444
+    },
+    {
+      "epoch": 0.24004585145477225,
+      "grad_norm": 40.25013732910156,
+      "learning_rate": 0.00019803548820374113,
+      "loss": 26.6703,
+      "step": 445
+    },
+    {
+      "epoch": 0.2405852803344459,
+      "grad_norm": 53.013710021972656,
+      "learning_rate": 0.00019802412093145657,
+      "loss": 35.5286,
+      "step": 446
+    },
+    {
+      "epoch": 0.24112470921411955,
+      "grad_norm": 41.21833038330078,
+      "learning_rate": 0.00019801272119483775,
+      "loss": 25.3315,
+      "step": 447
+    },
+    {
+      "epoch": 0.2416641380937932,
+      "grad_norm": 61.56970977783203,
+      "learning_rate": 0.00019800128899766017,
+      "loss": 27.589,
+      "step": 448
+    },
+    {
+      "epoch": 0.24220356697346684,
+      "grad_norm": 58.22453308105469,
+      "learning_rate": 0.00019798982434371,
+      "loss": 37.2235,
+      "step": 449
+    },
+    {
+      "epoch": 0.2427429958531405,
+      "grad_norm": 36.04716110229492,
+      "learning_rate": 0.00019797832723678413,
+      "loss": 28.1485,
+      "step": 450
+    },
+    {
+      "epoch": 0.24328242473281414,
+      "grad_norm": 50.804813385009766,
+      "learning_rate": 0.00019796679768069032,
+      "loss": 49.1471,
+      "step": 451
+    },
+    {
+      "epoch": 0.2438218536124878,
+      "grad_norm": 91.2785873413086,
+      "learning_rate": 0.00019795523567924702,
+      "loss": 72.8998,
+      "step": 452
+    },
+    {
+      "epoch": 0.24436128249216144,
+      "grad_norm": 110.37539672851562,
+      "learning_rate": 0.00019794364123628335,
+      "loss": 98.2308,
+      "step": 453
+    },
+    {
+      "epoch": 0.24490071137183506,
+      "grad_norm": 79.3825912475586,
+      "learning_rate": 0.00019793201435563932,
+      "loss": 109.7274,
+      "step": 454
+    },
+    {
+      "epoch": 0.2454401402515087,
+      "grad_norm": 36.62171173095703,
+      "learning_rate": 0.00019792035504116555,
+      "loss": 107.5116,
+      "step": 455
+    },
+    {
+      "epoch": 0.24597956913118235,
+      "grad_norm": 57.664146423339844,
+      "learning_rate": 0.00019790866329672346,
+      "loss": 113.5622,
+      "step": 456
+    },
+    {
+      "epoch": 0.246518998010856,
+      "grad_norm": 57.12027359008789,
+      "learning_rate": 0.00019789693912618524,
+      "loss": 102.4627,
+      "step": 457
+    },
+    {
+      "epoch": 0.24705842689052965,
+      "grad_norm": 67.92241668701172,
+      "learning_rate": 0.00019788518253343376,
+      "loss": 90.2483,
+      "step": 458
+    },
+    {
+      "epoch": 0.2475978557702033,
+      "grad_norm": 63.95331573486328,
+      "learning_rate": 0.00019787339352236264,
+      "loss": 94.7671,
+      "step": 459
+    },
+    {
+      "epoch": 0.24813728464987694,
+      "grad_norm": 55.70960235595703,
+      "learning_rate": 0.00019786157209687627,
+      "loss": 92.1523,
+      "step": 460
+    },
+    {
+      "epoch": 0.2486767135295506,
+      "grad_norm": 44.270233154296875,
+      "learning_rate": 0.00019784971826088973,
+      "loss": 82.3084,
+      "step": 461
+    },
+    {
+      "epoch": 0.24921614240922424,
+      "grad_norm": 35.74955749511719,
+      "learning_rate": 0.0001978378320183289,
+      "loss": 71.401,
+      "step": 462
+    },
+    {
+      "epoch": 0.2497555712888979,
+      "grad_norm": 26.20838165283203,
+      "learning_rate": 0.00019782591337313035,
+      "loss": 68.6018,
+      "step": 463
+    },
+    {
+      "epoch": 0.25029500016857154,
+      "grad_norm": 20.70208740234375,
+      "learning_rate": 0.00019781396232924133,
+      "loss": 62.6257,
+      "step": 464
+    },
+    {
+      "epoch": 0.25083442904824516,
+      "grad_norm": 17.804771423339844,
+      "learning_rate": 0.00019780197889061993,
+      "loss": 54.6564,
+      "step": 465
+    },
+    {
+      "epoch": 0.25137385792791883,
+      "grad_norm": 24.327360153198242,
+      "learning_rate": 0.0001977899630612349,
+      "loss": 50.7451,
+      "step": 466
+    },
+    {
+      "epoch": 0.25191328680759245,
+      "grad_norm": 29.580142974853516,
+      "learning_rate": 0.00019777791484506567,
+      "loss": 34.4045,
+      "step": 467
+    },
+    {
+      "epoch": 0.2524527156872661,
+      "grad_norm": 30.99888801574707,
+      "learning_rate": 0.00019776583424610254,
+      "loss": 41.2975,
+      "step": 468
+    },
+    {
+      "epoch": 0.25299214456693975,
+      "grad_norm": 40.59465408325195,
+      "learning_rate": 0.0001977537212683464,
+      "loss": 56.0607,
+      "step": 469
+    },
+    {
+      "epoch": 0.2535315734466134,
+      "grad_norm": 42.85790252685547,
+      "learning_rate": 0.00019774157591580894,
+      "loss": 40.9168,
+      "step": 470
+    },
+    {
+      "epoch": 0.25407100232628704,
+      "grad_norm": 38.090885162353516,
+      "learning_rate": 0.0001977293981925125,
+      "loss": 49.6262,
+      "step": 471
+    },
+    {
+      "epoch": 0.25461043120596066,
+      "grad_norm": 33.007991790771484,
+      "learning_rate": 0.0001977171881024902,
+      "loss": 44.5241,
+      "step": 472
+    },
+    {
+      "epoch": 0.25514986008563434,
+      "grad_norm": 39.41592025756836,
+      "learning_rate": 0.00019770494564978595,
+      "loss": 38.185,
+      "step": 473
+    },
+    {
+      "epoch": 0.25568928896530796,
+      "grad_norm": 33.008148193359375,
+      "learning_rate": 0.00019769267083845417,
+      "loss": 42.3843,
+      "step": 474
+    },
+    {
+      "epoch": 0.25622871784498163,
+      "grad_norm": 27.917991638183594,
+      "learning_rate": 0.0001976803636725602,
+      "loss": 33.7216,
+      "step": 475
+    },
+    {
+      "epoch": 0.25676814672465526,
+      "grad_norm": 29.870256423950195,
+      "learning_rate": 0.00019766802415617998,
+      "loss": 35.7963,
+      "step": 476
+    },
+    {
+      "epoch": 0.25730757560432893,
+      "grad_norm": 44.98633575439453,
+      "learning_rate": 0.0001976556522934002,
+      "loss": 35.8127,
+      "step": 477
+    },
+    {
+      "epoch": 0.25784700448400255,
+      "grad_norm": 43.03909683227539,
+      "learning_rate": 0.0001976432480883183,
+      "loss": 35.4111,
+      "step": 478
+    },
+    {
+      "epoch": 0.2583864333636762,
+      "grad_norm": 47.32424545288086,
+      "learning_rate": 0.00019763081154504234,
+      "loss": 41.8895,
+      "step": 479
+    },
+    {
+      "epoch": 0.25892586224334985,
+      "grad_norm": 49.7735595703125,
+      "learning_rate": 0.0001976183426676912,
+      "loss": 32.9801,
+      "step": 480
+    },
+    {
+      "epoch": 0.2594652911230235,
+      "grad_norm": 44.57673645019531,
+      "learning_rate": 0.0001976058414603944,
+      "loss": 36.089,
+      "step": 481
+    },
+    {
+      "epoch": 0.26000472000269714,
+      "grad_norm": 36.22349548339844,
+      "learning_rate": 0.00019759330792729212,
+      "loss": 47.0487,
+      "step": 482
+    },
+    {
+      "epoch": 0.26054414888237076,
+      "grad_norm": 38.58706283569336,
+      "learning_rate": 0.00019758074207253535,
+      "loss": 34.3672,
+      "step": 483
+    },
+    {
+      "epoch": 0.26108357776204444,
+      "grad_norm": 40.61176300048828,
+      "learning_rate": 0.00019756814390028575,
+      "loss": 39.7468,
+      "step": 484
+    },
+    {
+      "epoch": 0.26162300664171806,
+      "grad_norm": 29.439836502075195,
+      "learning_rate": 0.00019755551341471566,
+      "loss": 34.1449,
+      "step": 485
+    },
+    {
+      "epoch": 0.26216243552139173,
+      "grad_norm": 35.68241882324219,
+      "learning_rate": 0.00019754285062000815,
+      "loss": 31.6102,
+      "step": 486
+    },
+    {
+      "epoch": 0.26270186440106535,
+      "grad_norm": 44.2021598815918,
+      "learning_rate": 0.000197530155520357,
+      "loss": 31.8889,
+      "step": 487
+    },
+    {
+      "epoch": 0.26324129328073903,
+      "grad_norm": 53.82715606689453,
+      "learning_rate": 0.00019751742811996656,
+      "loss": 31.6853,
+      "step": 488
+    },
+    {
+      "epoch": 0.26378072216041265,
+      "grad_norm": 41.77256774902344,
+      "learning_rate": 0.00019750466842305208,
+      "loss": 39.1939,
+      "step": 489
+    },
+    {
+      "epoch": 0.2643201510400863,
+      "grad_norm": 36.42414093017578,
+      "learning_rate": 0.00019749187643383937,
+      "loss": 26.3978,
+      "step": 490
+    },
+    {
+      "epoch": 0.26485957991975995,
+      "grad_norm": 49.238014221191406,
+      "learning_rate": 0.00019747905215656498,
+      "loss": 33.8181,
+      "step": 491
+    },
+    {
+      "epoch": 0.2653990087994336,
+      "grad_norm": 37.46484375,
+      "learning_rate": 0.00019746619559547619,
+      "loss": 32.0879,
+      "step": 492
+    },
+    {
+      "epoch": 0.26593843767910724,
+      "grad_norm": 29.428075790405273,
+      "learning_rate": 0.00019745330675483084,
+      "loss": 22.5194,
+      "step": 493
+    },
+    {
+      "epoch": 0.2664778665587809,
+      "grad_norm": 42.24260330200195,
+      "learning_rate": 0.00019744038563889764,
+      "loss": 34.5577,
+      "step": 494
+    },
+    {
+      "epoch": 0.26701729543845454,
+      "grad_norm": 43.271976470947266,
+      "learning_rate": 0.00019742743225195582,
+      "loss": 25.107,
+      "step": 495
+    },
+    {
+      "epoch": 0.26755672431812816,
+      "grad_norm": 41.1341667175293,
+      "learning_rate": 0.00019741444659829543,
+      "loss": 24.4596,
+      "step": 496
+    },
+    {
+      "epoch": 0.26809615319780183,
+      "grad_norm": 35.3587760925293,
+      "learning_rate": 0.00019740142868221713,
+      "loss": 21.1434,
+      "step": 497
+    },
+    {
+      "epoch": 0.26863558207747545,
+      "grad_norm": 47.48214340209961,
+      "learning_rate": 0.00019738837850803226,
+      "loss": 23.4752,
+      "step": 498
+    },
+    {
+      "epoch": 0.26917501095714913,
+      "grad_norm": 44.637882232666016,
+      "learning_rate": 0.00019737529608006293,
+      "loss": 21.9525,
+      "step": 499
+    },
+    {
+      "epoch": 0.26971443983682275,
+      "grad_norm": 31.005287170410156,
+      "learning_rate": 0.00019736218140264185,
+      "loss": 19.1622,
+      "step": 500
+    },
+    {
+      "epoch": 0.2702538687164964,
+      "grad_norm": 32.10681915283203,
+      "learning_rate": 0.0001973490344801124,
+      "loss": 44.8021,
+      "step": 501
+    },
+    {
+      "epoch": 0.27079329759617005,
+      "grad_norm": 67.818603515625,
+      "learning_rate": 0.0001973358553168287,
+      "loss": 90.5945,
+      "step": 502
+    },
+    {
+      "epoch": 0.2713327264758437,
+      "grad_norm": 78.30387115478516,
+      "learning_rate": 0.00019732264391715556,
+      "loss": 101.037,
+      "step": 503
+    },
+    {
+      "epoch": 0.27187215535551734,
+      "grad_norm": 92.50519561767578,
+      "learning_rate": 0.00019730940028546835,
+      "loss": 124.3723,
+      "step": 504
+    },
+    {
+      "epoch": 0.272411584235191,
+      "grad_norm": 38.794246673583984,
+      "learning_rate": 0.0001972961244261532,
+      "loss": 105.1317,
+      "step": 505
+    },
+    {
+      "epoch": 0.27295101311486464,
+      "grad_norm": 34.56374740600586,
+      "learning_rate": 0.00019728281634360698,
+      "loss": 101.3536,
+      "step": 506
+    },
+    {
+      "epoch": 0.27349044199453826,
+      "grad_norm": 33.79701614379883,
+      "learning_rate": 0.00019726947604223712,
+      "loss": 105.4946,
+      "step": 507
+    },
+    {
+      "epoch": 0.27402987087421193,
+      "grad_norm": 39.242740631103516,
+      "learning_rate": 0.00019725610352646172,
+      "loss": 82.6645,
+      "step": 508
+    },
+    {
+      "epoch": 0.27456929975388555,
+      "grad_norm": 41.144683837890625,
+      "learning_rate": 0.0001972426988007096,
+      "loss": 99.5104,
+      "step": 509
+    },
+    {
+      "epoch": 0.27510872863355923,
+      "grad_norm": 43.32292175292969,
+      "learning_rate": 0.00019722926186942026,
+      "loss": 90.6068,
+      "step": 510
+    },
+    {
+      "epoch": 0.27564815751323285,
+      "grad_norm": 40.97383117675781,
+      "learning_rate": 0.0001972157927370438,
+      "loss": 71.8933,
+      "step": 511
+    },
+    {
+      "epoch": 0.2761875863929065,
+      "grad_norm": 27.89875602722168,
+      "learning_rate": 0.0001972022914080411,
+      "loss": 66.0499,
+      "step": 512
+    },
+    {
+      "epoch": 0.27672701527258015,
+      "grad_norm": 23.75403594970703,
+      "learning_rate": 0.00019718875788688354,
+      "loss": 59.9798,
+      "step": 513
+    },
+    {
+      "epoch": 0.2772664441522538,
+      "grad_norm": 18.101530075073242,
+      "learning_rate": 0.0001971751921780533,
+      "loss": 55.1379,
+      "step": 514
+    },
+    {
+      "epoch": 0.27780587303192744,
+      "grad_norm": 24.123146057128906,
+      "learning_rate": 0.00019716159428604315,
+      "loss": 51.0036,
+      "step": 515
+    },
+    {
+      "epoch": 0.2783453019116011,
+      "grad_norm": 29.12915802001953,
+      "learning_rate": 0.00019714796421535654,
+      "loss": 35.74,
+      "step": 516
+    },
+    {
+      "epoch": 0.27888473079127474,
+      "grad_norm": 41.40327072143555,
+      "learning_rate": 0.00019713430197050756,
+      "loss": 34.8342,
+      "step": 517
+    },
+    {
+      "epoch": 0.27942415967094836,
+      "grad_norm": 65.70941162109375,
+      "learning_rate": 0.00019712060755602102,
+      "loss": 45.6267,
+      "step": 518
+    },
+    {
+      "epoch": 0.27996358855062203,
+      "grad_norm": 37.733158111572266,
+      "learning_rate": 0.00019710688097643227,
+      "loss": 40.7,
+      "step": 519
+    },
+    {
+      "epoch": 0.28050301743029565,
+      "grad_norm": 39.90540313720703,
+      "learning_rate": 0.0001970931222362874,
+      "loss": 52.105,
+      "step": 520
+    },
+    {
+      "epoch": 0.28104244630996933,
+      "grad_norm": 41.023155212402344,
+      "learning_rate": 0.0001970793313401432,
+      "loss": 47.4019,
+      "step": 521
+    },
+    {
+      "epoch": 0.28158187518964295,
+      "grad_norm": 39.340972900390625,
+      "learning_rate": 0.00019706550829256693,
+      "loss": 36.3784,
+      "step": 522
+    },
+    {
+      "epoch": 0.2821213040693166,
+      "grad_norm": 31.36964988708496,
+      "learning_rate": 0.0001970516530981367,
+      "loss": 32.5883,
+      "step": 523
+    },
+    {
+      "epoch": 0.28266073294899025,
+      "grad_norm": 31.426342010498047,
+      "learning_rate": 0.00019703776576144105,
+      "loss": 37.0281,
+      "step": 524
+    },
+    {
+      "epoch": 0.2832001618286639,
+      "grad_norm": 48.170589447021484,
+      "learning_rate": 0.00019702384628707945,
+      "loss": 50.0541,
+      "step": 525
+    },
+    {
+      "epoch": 0.28373959070833754,
+      "grad_norm": 58.017845153808594,
+      "learning_rate": 0.0001970098946796617,
+      "loss": 35.1185,
+      "step": 526
+    },
+    {
+      "epoch": 0.2842790195880112,
+      "grad_norm": 44.51712417602539,
+      "learning_rate": 0.0001969959109438085,
+      "loss": 30.6861,
+      "step": 527
+    },
+    {
+      "epoch": 0.28481844846768484,
+      "grad_norm": 38.26441955566406,
+      "learning_rate": 0.00019698189508415102,
+      "loss": 42.7979,
+      "step": 528
+    },
+    {
+      "epoch": 0.28535787734735846,
+      "grad_norm": 33.41388702392578,
+      "learning_rate": 0.00019696784710533115,
+      "loss": 31.6934,
+      "step": 529
+    },
+    {
+      "epoch": 0.28589730622703213,
+      "grad_norm": 39.14249038696289,
+      "learning_rate": 0.00019695376701200145,
+      "loss": 31.4034,
+      "step": 530
+    },
+    {
+      "epoch": 0.28643673510670575,
+      "grad_norm": 38.64737319946289,
+      "learning_rate": 0.000196939654808825,
+      "loss": 35.3318,
+      "step": 531
+    },
+    {
+      "epoch": 0.28697616398637943,
+      "grad_norm": 32.65852355957031,
+      "learning_rate": 0.0001969255105004756,
+      "loss": 33.1427,
+      "step": 532
+    },
+    {
+      "epoch": 0.28751559286605305,
+      "grad_norm": 33.65852355957031,
+      "learning_rate": 0.0001969113340916377,
+      "loss": 31.0407,
+      "step": 533
+    },
+    {
+      "epoch": 0.2880550217457267,
+      "grad_norm": 31.496322631835938,
+      "learning_rate": 0.00019689712558700628,
+      "loss": 32.1776,
+      "step": 534
+    },
+    {
+      "epoch": 0.28859445062540034,
+      "grad_norm": 37.255680084228516,
+      "learning_rate": 0.00019688288499128707,
+      "loss": 32.4352,
+      "step": 535
+    },
+    {
+      "epoch": 0.289133879505074,
+      "grad_norm": 35.74131774902344,
+      "learning_rate": 0.00019686861230919635,
+      "loss": 39.0239,
+      "step": 536
+    },
+    {
+      "epoch": 0.28967330838474764,
+      "grad_norm": 62.805694580078125,
+      "learning_rate": 0.00019685430754546107,
+      "loss": 39.168,
+      "step": 537
+    },
+    {
+      "epoch": 0.2902127372644213,
+      "grad_norm": 32.74406814575195,
+      "learning_rate": 0.00019683997070481875,
+      "loss": 27.3064,
+      "step": 538
+    },
+    {
+      "epoch": 0.29075216614409494,
+      "grad_norm": 60.63595199584961,
+      "learning_rate": 0.00019682560179201759,
+      "loss": 37.3217,
+      "step": 539
+    },
+    {
+      "epoch": 0.2912915950237686,
+      "grad_norm": 49.350975036621094,
+      "learning_rate": 0.00019681120081181636,
+      "loss": 32.6254,
+      "step": 540
+    },
+    {
+      "epoch": 0.29183102390344223,
+      "grad_norm": 33.03507614135742,
+      "learning_rate": 0.00019679676776898454,
+      "loss": 23.6142,
+      "step": 541
+    },
+    {
+      "epoch": 0.29237045278311585,
+      "grad_norm": 46.380985260009766,
+      "learning_rate": 0.00019678230266830212,
+      "loss": 26.1048,
+      "step": 542
+    },
+    {
+      "epoch": 0.29290988166278953,
+      "grad_norm": 44.384132385253906,
+      "learning_rate": 0.00019676780551455977,
+      "loss": 19.0745,
+      "step": 543
+    },
+    {
+      "epoch": 0.29344931054246315,
+      "grad_norm": 32.757320404052734,
+      "learning_rate": 0.0001967532763125588,
+      "loss": 33.5921,
+      "step": 544
+    },
+    {
+      "epoch": 0.2939887394221368,
+      "grad_norm": 40.512939453125,
+      "learning_rate": 0.000196738715067111,
+      "loss": 23.9648,
+      "step": 545
+    },
+    {
+      "epoch": 0.29452816830181044,
+      "grad_norm": 36.085330963134766,
+      "learning_rate": 0.00019672412178303898,
+      "loss": 25.8736,
+      "step": 546
+    },
+    {
+      "epoch": 0.2950675971814841,
+      "grad_norm": 39.4991340637207,
+      "learning_rate": 0.00019670949646517576,
+      "loss": 35.8085,
+      "step": 547
+    },
+    {
+      "epoch": 0.29560702606115774,
+      "grad_norm": 56.80205535888672,
+      "learning_rate": 0.0001966948391183651,
+      "loss": 21.2566,
+      "step": 548
+    },
+    {
+      "epoch": 0.2961464549408314,
+      "grad_norm": 51.80792999267578,
+      "learning_rate": 0.00019668014974746133,
+      "loss": 19.3891,
+      "step": 549
+    },
+    {
+      "epoch": 0.29668588382050504,
+      "grad_norm": 40.740726470947266,
+      "learning_rate": 0.00019666542835732937,
+      "loss": 17.442,
+      "step": 550
+    },
+    {
+      "epoch": 0.2972253127001787,
+      "grad_norm": 43.78228759765625,
+      "learning_rate": 0.00019665067495284476,
+      "loss": 53.1444,
+      "step": 551
+    },
+    {
+      "epoch": 0.29776474157985233,
+      "grad_norm": 68.15139770507812,
+      "learning_rate": 0.00019663588953889363,
+      "loss": 83.8455,
+      "step": 552
+    },
+    {
+      "epoch": 0.29830417045952595,
+      "grad_norm": 57.72416305541992,
+      "learning_rate": 0.00019662107212037273,
+      "loss": 91.3314,
+      "step": 553
+    },
+    {
+      "epoch": 0.29884359933919963,
+      "grad_norm": 70.40361785888672,
+      "learning_rate": 0.0001966062227021894,
+      "loss": 115.1381,
+      "step": 554
+    },
+    {
+      "epoch": 0.29938302821887325,
+      "grad_norm": 33.6906623840332,
+      "learning_rate": 0.00019659134128926156,
+      "loss": 96.5649,
+      "step": 555
+    },
+    {
+      "epoch": 0.2999224570985469,
+      "grad_norm": 41.24090576171875,
+      "learning_rate": 0.00019657642788651776,
+      "loss": 104.8012,
+      "step": 556
+    },
+    {
+      "epoch": 0.30046188597822054,
+      "grad_norm": 62.62508773803711,
+      "learning_rate": 0.00019656148249889714,
+      "loss": 89.1584,
+      "step": 557
+    },
+    {
+      "epoch": 0.3010013148578942,
+      "grad_norm": 54.20726013183594,
+      "learning_rate": 0.00019654650513134937,
+      "loss": 102.4601,
+      "step": 558
+    },
+    {
+      "epoch": 0.30154074373756784,
+      "grad_norm": 51.19554138183594,
+      "learning_rate": 0.00019653149578883482,
+      "loss": 94.7273,
+      "step": 559
+    },
+    {
+      "epoch": 0.3020801726172415,
+      "grad_norm": 50.297447204589844,
+      "learning_rate": 0.00019651645447632437,
+      "loss": 85.4999,
+      "step": 560
+    },
+    {
+      "epoch": 0.30261960149691514,
+      "grad_norm": 43.541648864746094,
+      "learning_rate": 0.00019650138119879952,
+      "loss": 84.9936,
+      "step": 561
+    },
+    {
+      "epoch": 0.3031590303765888,
+      "grad_norm": 30.611860275268555,
+      "learning_rate": 0.00019648627596125233,
+      "loss": 68.3871,
+      "step": 562
+    },
+    {
+      "epoch": 0.30369845925626243,
+      "grad_norm": 18.373859405517578,
+      "learning_rate": 0.00019647113876868546,
+      "loss": 64.1806,
+      "step": 563
+    },
+    {
+      "epoch": 0.30423788813593605,
+      "grad_norm": 17.967041015625,
+      "learning_rate": 0.00019645596962611218,
+      "loss": 58.1967,
+      "step": 564
+    },
+    {
+      "epoch": 0.30477731701560973,
+      "grad_norm": 17.57683563232422,
+      "learning_rate": 0.00019644076853855626,
+      "loss": 48.7426,
+      "step": 565
+    },
+    {
+      "epoch": 0.30531674589528335,
+      "grad_norm": 24.4635066986084,
+      "learning_rate": 0.00019642553551105219,
+      "loss": 45.5702,
+      "step": 566
+    },
+    {
+      "epoch": 0.305856174774957,
+      "grad_norm": 44.31038284301758,
+      "learning_rate": 0.0001964102705486449,
+      "loss": 36.4538,
+      "step": 567
+    },
+    {
+      "epoch": 0.30639560365463064,
+      "grad_norm": 45.66762924194336,
+      "learning_rate": 0.00019639497365638993,
+      "loss": 37.6228,
+      "step": 568
+    },
+    {
+      "epoch": 0.3069350325343043,
+      "grad_norm": 45.2806282043457,
+      "learning_rate": 0.00019637964483935346,
+      "loss": 47.7514,
+      "step": 569
+    },
+    {
+      "epoch": 0.30747446141397794,
+      "grad_norm": 44.627296447753906,
+      "learning_rate": 0.00019636428410261218,
+      "loss": 50.5934,
+      "step": 570
+    },
+    {
+      "epoch": 0.3080138902936516,
+      "grad_norm": 39.8631706237793,
+      "learning_rate": 0.00019634889145125336,
+      "loss": 33.2035,
+      "step": 571
+    },
+    {
+      "epoch": 0.30855331917332524,
+      "grad_norm": 43.88326644897461,
+      "learning_rate": 0.00019633346689037486,
+      "loss": 44.4418,
+      "step": 572
+    },
+    {
+      "epoch": 0.3090927480529989,
+      "grad_norm": 31.599515914916992,
+      "learning_rate": 0.0001963180104250851,
+      "loss": 29.8656,
+      "step": 573
+    },
+    {
+      "epoch": 0.30963217693267253,
+      "grad_norm": 29.062061309814453,
+      "learning_rate": 0.00019630252206050307,
+      "loss": 29.4416,
+      "step": 574
+    },
+    {
+      "epoch": 0.31017160581234615,
+      "grad_norm": 35.07856750488281,
+      "learning_rate": 0.00019628700180175833,
+      "loss": 33.663,
+      "step": 575
+    },
+    {
+      "epoch": 0.3107110346920198,
+      "grad_norm": 38.65933609008789,
+      "learning_rate": 0.00019627144965399094,
+      "loss": 43.6982,
+      "step": 576
+    },
+    {
+      "epoch": 0.31125046357169345,
+      "grad_norm": 36.53346252441406,
+      "learning_rate": 0.0001962558656223516,
+      "loss": 41.9741,
+      "step": 577
+    },
+    {
+      "epoch": 0.3117898924513671,
+      "grad_norm": 50.61214065551758,
+      "learning_rate": 0.00019624024971200154,
+      "loss": 31.3103,
+      "step": 578
+    },
+    {
+      "epoch": 0.31232932133104074,
+      "grad_norm": 39.70477294921875,
+      "learning_rate": 0.00019622460192811255,
+      "loss": 40.1001,
+      "step": 579
+    },
+    {
+      "epoch": 0.3128687502107144,
+      "grad_norm": 43.24115753173828,
+      "learning_rate": 0.000196208922275867,
+      "loss": 38.9648,
+      "step": 580
+    },
+    {
+      "epoch": 0.31340817909038804,
+      "grad_norm": 49.614410400390625,
+      "learning_rate": 0.00019619321076045778,
+      "loss": 38.396,
+      "step": 581
+    },
+    {
+      "epoch": 0.3139476079700617,
+      "grad_norm": 38.65335464477539,
+      "learning_rate": 0.0001961774673870883,
+      "loss": 33.8401,
+      "step": 582
+    },
+    {
+      "epoch": 0.31448703684973534,
+      "grad_norm": 36.919837951660156,
+      "learning_rate": 0.00019616169216097262,
+      "loss": 40.8598,
+      "step": 583
+    },
+    {
+      "epoch": 0.315026465729409,
+      "grad_norm": 34.90658187866211,
+      "learning_rate": 0.00019614588508733524,
+      "loss": 26.7875,
+      "step": 584
+    },
+    {
+      "epoch": 0.31556589460908263,
+      "grad_norm": 36.6773796081543,
+      "learning_rate": 0.00019613004617141132,
+      "loss": 38.7512,
+      "step": 585
+    },
+    {
+      "epoch": 0.3161053234887563,
+      "grad_norm": 38.80603790283203,
+      "learning_rate": 0.00019611417541844645,
+      "loss": 22.4567,
+      "step": 586
+    },
+    {
+      "epoch": 0.3166447523684299,
+      "grad_norm": 39.85905838012695,
+      "learning_rate": 0.00019609827283369687,
+      "loss": 34.7722,
+      "step": 587
+    },
+    {
+      "epoch": 0.31718418124810355,
+      "grad_norm": 42.714210510253906,
+      "learning_rate": 0.00019608233842242925,
+      "loss": 29.6514,
+      "step": 588
+    },
+    {
+      "epoch": 0.3177236101277772,
+      "grad_norm": 28.49331283569336,
+      "learning_rate": 0.00019606637218992092,
+      "loss": 32.2811,
+      "step": 589
+    },
+    {
+      "epoch": 0.31826303900745084,
+      "grad_norm": 38.48284912109375,
+      "learning_rate": 0.0001960503741414597,
+      "loss": 19.4347,
+      "step": 590
+    },
+    {
+      "epoch": 0.3188024678871245,
+      "grad_norm": 40.46686553955078,
+      "learning_rate": 0.00019603434428234389,
+      "loss": 36.0755,
+      "step": 591
+    },
+    {
+      "epoch": 0.31934189676679814,
+      "grad_norm": 33.52849578857422,
+      "learning_rate": 0.00019601828261788236,
+      "loss": 23.4967,
+      "step": 592
+    },
+    {
+      "epoch": 0.3198813256464718,
+      "grad_norm": 36.89003372192383,
+      "learning_rate": 0.0001960021891533946,
+      "loss": 17.4822,
+      "step": 593
+    },
+    {
+      "epoch": 0.32042075452614543,
+      "grad_norm": 47.023624420166016,
+      "learning_rate": 0.00019598606389421055,
+      "loss": 26.3533,
+      "step": 594
+    },
+    {
+      "epoch": 0.3209601834058191,
+      "grad_norm": 53.969627380371094,
+      "learning_rate": 0.00019596990684567063,
+      "loss": 36.3338,
+      "step": 595
+    },
+    {
+      "epoch": 0.32149961228549273,
+      "grad_norm": 31.71206283569336,
+      "learning_rate": 0.00019595371801312588,
+      "loss": 23.1099,
+      "step": 596
+    },
+    {
+      "epoch": 0.3220390411651664,
+      "grad_norm": 34.602901458740234,
+      "learning_rate": 0.00019593749740193784,
+      "loss": 20.7281,
+      "step": 597
+    },
+    {
+      "epoch": 0.32257847004484,
+      "grad_norm": 32.23836135864258,
+      "learning_rate": 0.00019592124501747855,
+      "loss": 19.1565,
+      "step": 598
+    },
+    {
+      "epoch": 0.32311789892451365,
+      "grad_norm": 31.762807846069336,
+      "learning_rate": 0.00019590496086513063,
+      "loss": 20.822,
+      "step": 599
+    },
+    {
+      "epoch": 0.3236573278041873,
+      "grad_norm": 38.77958297729492,
+      "learning_rate": 0.00019588864495028712,
+      "loss": 20.7172,
+      "step": 600
+    },
+    {
+      "epoch": 0.3236573278041873,
+      "eval_loss": 1.5770864486694336,
+      "eval_runtime": 140.3936,
+      "eval_samples_per_second": 2.13,
+      "eval_steps_per_second": 2.13,
+      "step": 600
     }
   ],
   "logging_steps": 1,
       "attributes": {}
     }
   },
+  "total_flos": 1.0696873835715625e+18,
   "train_batch_size": 1,
   "trial_name": null,
   "trial_params": null

last-checkpoint/training_args.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:75d521b926cc9b52658f752edb06fe011094ff558c587d132452d4dea2c1c386
 size 6776

 version https://git-lfs.github.com/spec/v1
+oid sha256:60cf1b2e269c89ca03e3babc5e2888ab355b88cc062be4295ed1566822fe04c6
 size 6776