Training in progress, step 2425

Browse files

Files changed (6) hide show

adapter_model.safetensors +1 -1
last-checkpoint/adapter_config.json +5 -5
last-checkpoint/adapter_model.safetensors +1 -1
last-checkpoint/optimizer.pt +1 -1
last-checkpoint/scheduler.pt +1 -1
last-checkpoint/trainer_state.json +3 -473

adapter_model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:16867459c238ecc7a5d407fbde70d292d80c5c02caba34a26ed29ea260f4db5d
 size 108113968

 version https://git-lfs.github.com/spec/v1
+oid sha256:ab0efb69d2dd4ccd18add7c8d575a842a3d074ef54403c99c6db819af71c77a1
 size 108113968

last-checkpoint/adapter_config.json CHANGED Viewed

@@ -29,13 +29,13 @@
   "rank_pattern": {},
   "revision": null,
   "target_modules": [
-    "gate_proj",
-    "v_proj",
     "k_proj",
     "o_proj",
-    "down_proj",
-    "q_proj",
-    "up_proj"
   ],
   "target_parameters": null,
   "task_type": "CAUSAL_LM",

   "rank_pattern": {},
   "revision": null,
   "target_modules": [
+    "down_proj",
     "k_proj",
+    "up_proj",
     "o_proj",
+    "gate_proj",
+    "v_proj",
+    "q_proj"
   ],
   "target_parameters": null,
   "task_type": "CAUSAL_LM",

last-checkpoint/adapter_model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:16867459c238ecc7a5d407fbde70d292d80c5c02caba34a26ed29ea260f4db5d
 size 108113968

 version https://git-lfs.github.com/spec/v1
+oid sha256:23cc74daf6c70e4089aff09333b0706b30bba28a0cf6991c49bb172b7614c70a
 size 108113968

last-checkpoint/optimizer.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f8efc06c38fa74258b408de16da02e778df592df2079aa509df3315fa14619e4
 size 57081771

 version https://git-lfs.github.com/spec/v1
+oid sha256:45e2021b79e47156a888c5a9b65619596c377b821c0a36373339d1a5a3dfdb5b
 size 57081771

last-checkpoint/scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:14dad153ace9f17b878a1326b68c8639a626f910f5e0ed1e9324c8e1af846b2d
 size 1465

 version https://git-lfs.github.com/spec/v1
+oid sha256:bb8c24123d0a6abb40712c04ec45e32a580173995d543bee32e57aefd8bd098c
 size 1465

last-checkpoint/trainer_state.json CHANGED Viewed

@@ -2,9 +2,9 @@
   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.6878761822871883,
   "eval_steps": 300,
-  "global_step": 2400,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -498,476 +498,6 @@
       "mean_token_accuracy": 0.9329368638992309,
       "num_tokens": 122190.0,
       "step": 1225
-    },
-    {
-      "entropy": 0.2075369682908058,
-      "epoch": 0.3582688449412439,
-      "grad_norm": 0.39205244183540344,
-      "learning_rate": 0.0002,
-      "loss": 0.2030281639099121,
-      "mean_token_accuracy": 0.9333784127235413,
-      "num_tokens": 40591.0,
-      "step": 1250
-    },
-    {
-      "entropy": 0.21607059925794603,
-      "epoch": 0.3654342218400688,
-      "grad_norm": 0.24999791383743286,
-      "learning_rate": 0.0002,
-      "loss": 0.21144355773925783,
-      "mean_token_accuracy": 0.93144282579422,
-      "num_tokens": 81873.0,
-      "step": 1275
-    },
-    {
-      "entropy": 0.2062842446565628,
-      "epoch": 0.37259959873889364,
-      "grad_norm": 0.36865198612213135,
-      "learning_rate": 0.0002,
-      "loss": 0.20512981414794923,
-      "mean_token_accuracy": 0.9319202589988709,
-      "num_tokens": 122819.0,
-      "step": 1300
-    },
-    {
-      "entropy": 0.20617701470851899,
-      "epoch": 0.37976497563771855,
-      "grad_norm": 0.2900436222553253,
-      "learning_rate": 0.0002,
-      "loss": 0.20203329086303712,
-      "mean_token_accuracy": 0.9338054943084717,
-      "num_tokens": 164426.0,
-      "step": 1325
-    },
-    {
-      "entropy": 0.20276340633630752,
-      "epoch": 0.3869303525365434,
-      "grad_norm": 0.3424394130706787,
-      "learning_rate": 0.0002,
-      "loss": 0.19647628784179688,
-      "mean_token_accuracy": 0.9352651834487915,
-      "num_tokens": 204693.0,
-      "step": 1350
-    },
-    {
-      "entropy": 0.19353608846664427,
-      "epoch": 0.3940957294353683,
-      "grad_norm": 0.2800115644931793,
-      "learning_rate": 0.0002,
-      "loss": 0.1909392547607422,
-      "mean_token_accuracy": 0.936229157447815,
-      "num_tokens": 245202.0,
-      "step": 1375
-    },
-    {
-      "entropy": 0.20765207797288895,
-      "epoch": 0.40126110633419315,
-      "grad_norm": 0.29286009073257446,
-      "learning_rate": 0.0002,
-      "loss": 0.20011020660400392,
-      "mean_token_accuracy": 0.9337928628921509,
-      "num_tokens": 286944.0,
-      "step": 1400
-    },
-    {
-      "entropy": 0.2191137021780014,
-      "epoch": 0.40842648323301806,
-      "grad_norm": 0.26620274782180786,
-      "learning_rate": 0.0002,
-      "loss": 0.21411985397338867,
-      "mean_token_accuracy": 0.9297138857841492,
-      "num_tokens": 329201.0,
-      "step": 1425
-    },
-    {
-      "entropy": 0.21264588654041292,
-      "epoch": 0.41559186013184296,
-      "grad_norm": 0.38385578989982605,
-      "learning_rate": 0.0002,
-      "loss": 0.208143367767334,
-      "mean_token_accuracy": 0.9316162085533142,
-      "num_tokens": 371891.0,
-      "step": 1450
-    },
-    {
-      "entropy": 0.2007578819990158,
-      "epoch": 0.4227572370306678,
-      "grad_norm": 0.3052174746990204,
-      "learning_rate": 0.0002,
-      "loss": 0.19854948043823242,
-      "mean_token_accuracy": 0.9345853734016418,
-      "num_tokens": 412534.0,
-      "step": 1475
-    },
-    {
-      "entropy": 0.20256735682487487,
-      "epoch": 0.4299226139294927,
-      "grad_norm": 0.2761523723602295,
-      "learning_rate": 0.0002,
-      "loss": 0.19539085388183594,
-      "mean_token_accuracy": 0.9367341923713685,
-      "num_tokens": 452855.0,
-      "step": 1500
-    },
-    {
-      "entropy": 0.19164222806692124,
-      "epoch": 0.43708799082831756,
-      "grad_norm": 0.3495299220085144,
-      "learning_rate": 0.0002,
-      "loss": 0.1890553665161133,
-      "mean_token_accuracy": 0.9373278784751892,
-      "num_tokens": 493202.0,
-      "step": 1525
-    },
-    {
-      "entropy": 0.20341520249843598,
-      "epoch": 0.44425336772714247,
-      "grad_norm": 0.3206697702407837,
-      "learning_rate": 0.0002,
-      "loss": 0.20173826217651367,
-      "mean_token_accuracy": 0.9332520008087158,
-      "num_tokens": 534946.0,
-      "step": 1550
-    },
-    {
-      "entropy": 0.20512860178947448,
-      "epoch": 0.4514187446259673,
-      "grad_norm": 0.369289755821228,
-      "learning_rate": 0.0002,
-      "loss": 0.1998735237121582,
-      "mean_token_accuracy": 0.9336103320121765,
-      "num_tokens": 576115.0,
-      "step": 1575
-    },
-    {
-      "entropy": 0.19730552673339843,
-      "epoch": 0.4585841215247922,
-      "grad_norm": 0.1693185716867447,
-      "learning_rate": 0.0002,
-      "loss": 0.19725181579589843,
-      "mean_token_accuracy": 0.9343607997894288,
-      "num_tokens": 616923.0,
-      "step": 1600
-    },
-    {
-      "entropy": 0.20145605146884918,
-      "epoch": 0.46574949842361707,
-      "grad_norm": 0.34067076444625854,
-      "learning_rate": 0.0002,
-      "loss": 0.19863763809204102,
-      "mean_token_accuracy": 0.9344722628593445,
-      "num_tokens": 658021.0,
-      "step": 1625
-    },
-    {
-      "entropy": 0.19174030989408494,
-      "epoch": 0.472914875322442,
-      "grad_norm": 0.282787024974823,
-      "learning_rate": 0.0002,
-      "loss": 0.18856592178344728,
-      "mean_token_accuracy": 0.9382144474983215,
-      "num_tokens": 698701.0,
-      "step": 1650
-    },
-    {
-      "entropy": 0.19893687069416047,
-      "epoch": 0.4800802522212668,
-      "grad_norm": 0.21854329109191895,
-      "learning_rate": 0.0002,
-      "loss": 0.19327503204345703,
-      "mean_token_accuracy": 0.9353450679779053,
-      "num_tokens": 739913.0,
-      "step": 1675
-    },
-    {
-      "entropy": 0.19346537590026855,
-      "epoch": 0.48724562912009173,
-      "grad_norm": 0.19436436891555786,
-      "learning_rate": 0.0002,
-      "loss": 0.19321285247802733,
-      "mean_token_accuracy": 0.9373372173309327,
-      "num_tokens": 780719.0,
-      "step": 1700
-    },
-    {
-      "entropy": 0.20528113186359406,
-      "epoch": 0.4944110060189166,
-      "grad_norm": 0.31415456533432007,
-      "learning_rate": 0.0002,
-      "loss": 0.2044132423400879,
-      "mean_token_accuracy": 0.9320711612701416,
-      "num_tokens": 822130.0,
-      "step": 1725
-    },
-    {
-      "entropy": 0.20051146537065506,
-      "epoch": 0.5015763829177414,
-      "grad_norm": 0.36767083406448364,
-      "learning_rate": 0.0002,
-      "loss": 0.19968202590942383,
-      "mean_token_accuracy": 0.9361233901977539,
-      "num_tokens": 863055.0,
-      "step": 1750
-    },
-    {
-      "entropy": 0.19146274596452714,
-      "epoch": 0.5087417598165663,
-      "grad_norm": 0.36641210317611694,
-      "learning_rate": 0.0002,
-      "loss": 0.1849520492553711,
-      "mean_token_accuracy": 0.9378811025619507,
-      "num_tokens": 903979.0,
-      "step": 1775
-    },
-    {
-      "entropy": 0.20497863948345185,
-      "epoch": 0.5159071367153912,
-      "grad_norm": 0.41181716322898865,
-      "learning_rate": 0.0002,
-      "loss": 0.2043849754333496,
-      "mean_token_accuracy": 0.9320770597457886,
-      "num_tokens": 945010.0,
-      "step": 1800
-    },
-    {
-      "entropy": 0.19871506720781326,
-      "epoch": 0.5230725136142161,
-      "grad_norm": 0.34865760803222656,
-      "learning_rate": 0.0002,
-      "loss": 0.19058765411376954,
-      "mean_token_accuracy": 0.936968915462494,
-      "num_tokens": 985351.0,
-      "step": 1825
-    },
-    {
-      "entropy": 0.21031922459602356,
-      "epoch": 0.5302378905130409,
-      "grad_norm": 0.35983604192733765,
-      "learning_rate": 0.0002,
-      "loss": 0.20398990631103517,
-      "mean_token_accuracy": 0.9338763618469238,
-      "num_tokens": 1027146.0,
-      "step": 1850
-    },
-    {
-      "entropy": 0.20145108669996262,
-      "epoch": 0.5374032674118658,
-      "grad_norm": 0.2126716524362564,
-      "learning_rate": 0.0002,
-      "loss": 0.19558551788330078,
-      "mean_token_accuracy": 0.9350816106796265,
-      "num_tokens": 1068454.0,
-      "step": 1875
-    },
-    {
-      "entropy": 0.19600239813327788,
-      "epoch": 0.5445686443106907,
-      "grad_norm": 0.2547587752342224,
-      "learning_rate": 0.0002,
-      "loss": 0.18890924453735353,
-      "mean_token_accuracy": 0.9360025477409363,
-      "num_tokens": 1109230.0,
-      "step": 1900
-    },
-    {
-      "entropy": 0.17782112330198288,
-      "epoch": 0.5517340212095156,
-      "grad_norm": 0.28866520524024963,
-      "learning_rate": 0.0002,
-      "loss": 0.17644382476806642,
-      "mean_token_accuracy": 0.9422430300712585,
-      "num_tokens": 1148978.0,
-      "step": 1925
-    },
-    {
-      "entropy": 0.18634845435619354,
-      "epoch": 0.5588993981083406,
-      "grad_norm": 0.2348451316356659,
-      "learning_rate": 0.0002,
-      "loss": 0.1815641212463379,
-      "mean_token_accuracy": 0.9392524695396424,
-      "num_tokens": 1189196.0,
-      "step": 1950
-    },
-    {
-      "entropy": 0.18852397054433823,
-      "epoch": 0.5660647750071653,
-      "grad_norm": 0.25562164187431335,
-      "learning_rate": 0.0002,
-      "loss": 0.18350950241088868,
-      "mean_token_accuracy": 0.9394415140151977,
-      "num_tokens": 1229072.0,
-      "step": 1975
-    },
-    {
-      "entropy": 0.18256970256567,
-      "epoch": 0.5732301519059902,
-      "grad_norm": 0.36442917585372925,
-      "learning_rate": 0.0002,
-      "loss": 0.18093914031982422,
-      "mean_token_accuracy": 0.9397966265678406,
-      "num_tokens": 1269371.0,
-      "step": 2000
-    },
-    {
-      "entropy": 0.20554341971874238,
-      "epoch": 0.5803955288048152,
-      "grad_norm": 0.3102213442325592,
-      "learning_rate": 0.0002,
-      "loss": 0.2052627372741699,
-      "mean_token_accuracy": 0.9325902485847473,
-      "num_tokens": 1311354.0,
-      "step": 2025
-    },
-    {
-      "entropy": 0.2037496653199196,
-      "epoch": 0.5875609057036401,
-      "grad_norm": 0.24330857396125793,
-      "learning_rate": 0.0002,
-      "loss": 0.20022052764892578,
-      "mean_token_accuracy": 0.9342735767364502,
-      "num_tokens": 1353051.0,
-      "step": 2050
-    },
-    {
-      "entropy": 0.19858424603939057,
-      "epoch": 0.5947262826024649,
-      "grad_norm": 0.2955344021320343,
-      "learning_rate": 0.0002,
-      "loss": 0.19497306823730468,
-      "mean_token_accuracy": 0.9353799057006836,
-      "num_tokens": 1394712.0,
-      "step": 2075
-    },
-    {
-      "entropy": 0.20194011509418489,
-      "epoch": 0.6018916595012898,
-      "grad_norm": 0.20898522436618805,
-      "learning_rate": 0.0002,
-      "loss": 0.19739873886108397,
-      "mean_token_accuracy": 0.9346665263175964,
-      "num_tokens": 1436282.0,
-      "step": 2100
-    },
-    {
-      "entropy": 0.18827197730541229,
-      "epoch": 0.6090570364001147,
-      "grad_norm": 0.3064703643321991,
-      "learning_rate": 0.0002,
-      "loss": 0.1838802719116211,
-      "mean_token_accuracy": 0.939849009513855,
-      "num_tokens": 1476569.0,
-      "step": 2125
-    },
-    {
-      "entropy": 0.20322401821613312,
-      "epoch": 0.6162224132989396,
-      "grad_norm": 0.42201489210128784,
-      "learning_rate": 0.0002,
-      "loss": 0.20033023834228517,
-      "mean_token_accuracy": 0.9345659923553467,
-      "num_tokens": 1518315.0,
-      "step": 2150
-    },
-    {
-      "entropy": 0.1822732812166214,
-      "epoch": 0.6233877901977644,
-      "grad_norm": 0.2799566388130188,
-      "learning_rate": 0.0002,
-      "loss": 0.18143136978149413,
-      "mean_token_accuracy": 0.9404231834411622,
-      "num_tokens": 1558340.0,
-      "step": 2175
-    },
-    {
-      "entropy": 0.19505684196949005,
-      "epoch": 0.6305531670965893,
-      "grad_norm": 0.20578612387180328,
-      "learning_rate": 0.0002,
-      "loss": 0.18889547348022462,
-      "mean_token_accuracy": 0.9381808185577393,
-      "num_tokens": 1599592.0,
-      "step": 2200
-    },
-    {
-      "entropy": 0.19981920778751372,
-      "epoch": 0.6377185439954142,
-      "grad_norm": 0.28131991624832153,
-      "learning_rate": 0.0002,
-      "loss": 0.19793636322021485,
-      "mean_token_accuracy": 0.935631537437439,
-      "num_tokens": 1641401.0,
-      "step": 2225
-    },
-    {
-      "entropy": 0.19168403446674348,
-      "epoch": 0.6448839208942391,
-      "grad_norm": 0.25856539607048035,
-      "learning_rate": 0.0002,
-      "loss": 0.1897783088684082,
-      "mean_token_accuracy": 0.9356019353866577,
-      "num_tokens": 1682949.0,
-      "step": 2250
-    },
-    {
-      "entropy": 0.1931222453713417,
-      "epoch": 0.6520492977930639,
-      "grad_norm": 0.4090195596218109,
-      "learning_rate": 0.0002,
-      "loss": 0.1929492950439453,
-      "mean_token_accuracy": 0.9369300937652588,
-      "num_tokens": 1724557.0,
-      "step": 2275
-    },
-    {
-      "entropy": 0.19567115902900695,
-      "epoch": 0.6592146746918888,
-      "grad_norm": 0.19224579632282257,
-      "learning_rate": 0.0002,
-      "loss": 0.19031965255737304,
-      "mean_token_accuracy": 0.9367053961753845,
-      "num_tokens": 1765618.0,
-      "step": 2300
-    },
-    {
-      "entropy": 0.18622912466526031,
-      "epoch": 0.6663800515907137,
-      "grad_norm": 0.27013909816741943,
-      "learning_rate": 0.0002,
-      "loss": 0.18465063095092774,
-      "mean_token_accuracy": 0.9383154940605164,
-      "num_tokens": 1806491.0,
-      "step": 2325
-    },
-    {
-      "entropy": 0.19851551949977875,
-      "epoch": 0.6735454284895386,
-      "grad_norm": 0.3999996483325958,
-      "learning_rate": 0.0002,
-      "loss": 0.19640205383300782,
-      "mean_token_accuracy": 0.9344918823242188,
-      "num_tokens": 1848741.0,
-      "step": 2350
-    },
-    {
-      "entropy": 0.18972006916999817,
-      "epoch": 0.6807108053883635,
-      "grad_norm": 0.26580268144607544,
-      "learning_rate": 0.0002,
-      "loss": 0.1871095657348633,
-      "mean_token_accuracy": 0.9390228629112244,
-      "num_tokens": 1890071.0,
-      "step": 2375
-    },
-    {
-      "entropy": 0.19580536246299743,
-      "epoch": 0.6878761822871883,
-      "grad_norm": 0.2682396471500397,
-      "learning_rate": 0.0002,
-      "loss": 0.19406461715698242,
-      "mean_token_accuracy": 0.9354706478118896,
-      "num_tokens": 1931751.0,
-      "step": 2400
     }
   ],
   "logging_steps": 25,
@@ -987,7 +517,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 2.329083936152494e+17,
   "train_batch_size": 4,
   "trial_name": null,
   "trial_params": null

   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
+  "epoch": 0.35110346804241904,
   "eval_steps": 300,
+  "global_step": 1225,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
       "mean_token_accuracy": 0.9329368638992309,
       "num_tokens": 122190.0,
       "step": 1225
     }
   ],
   "logging_steps": 25,
       "attributes": {}
     }
   },
+  "total_flos": 1.1890534404816077e+17,
   "train_batch_size": 4,
   "trial_name": null,
   "trial_params": null