Training in progress, step 1500, checkpoint

Browse files

Files changed (5) hide show

last-checkpoint/adapter_config.json +4 -4
last-checkpoint/adapter_model.safetensors +1 -1
last-checkpoint/optimizer.pt +2 -2
last-checkpoint/scheduler.pt +1 -1
last-checkpoint/trainer_state.json +633 -3

last-checkpoint/adapter_config.json CHANGED Viewed

@@ -33,13 +33,13 @@
   "rank_pattern": {},
   "revision": null,
   "target_modules": [
     "up_proj",
-    "q_proj",
     "v_proj",
-    "k_proj",
-    "o_proj",
     "gate_proj",
-    "down_proj"
   ],
   "target_parameters": null,
   "task_type": "CAUSAL_LM",

   "rank_pattern": {},
   "revision": null,
   "target_modules": [
+    "o_proj",
+    "down_proj",
     "up_proj",
     "v_proj",
+    "q_proj",
     "gate_proj",
+    "k_proj"
   ],
   "target_parameters": null,
   "task_type": "CAUSAL_LM",

last-checkpoint/adapter_model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ac86271e61f61c7e0e996c9a0b387781c0cf7e105d9e2809cf486578571e3692
 size 121537408

 version https://git-lfs.github.com/spec/v1
+oid sha256:817afca3f28ecc00af288ad47b0c44001f470fcb6e47e6d851d9f7f91f49bf73
 size 121537408

last-checkpoint/optimizer.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e6166d92c0eec7a743b35a9b0a6952a1ccf685a19c88ceec877f066ad8bcf660
-size 62000725

 version https://git-lfs.github.com/spec/v1
+oid sha256:926e38ffe3e05f541d9d7c735821d431b86fb526eadfa0605dc6186036c0bd0e
+size 62655371

last-checkpoint/scheduler.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f2f5818bcde61cb939645ced952eb7a6ec5c7bbec5f630a7156d7c2ae39b50d0
 size 1465

 version https://git-lfs.github.com/spec/v1
+oid sha256:fa1cd24339e72e908e643df363a721bfe37400f8ac5f6edea2f572d9a9f52a64
 size 1465

last-checkpoint/trainer_state.json CHANGED Viewed

@@ -2,9 +2,9 @@
   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.2059202059202059,
   "eval_steps": 500,
-  "global_step": 600,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -428,6 +428,636 @@
       "learning_rate": 0.00019783319385748891,
       "loss": 0.005371841043233872,
       "step": 600
     }
   ],
   "logging_steps": 10,
@@ -447,7 +1077,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 2276492884035072.0,
   "train_batch_size": 2,
   "trial_name": null,
   "trial_params": null

   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
+  "epoch": 0.5148005148005148,
   "eval_steps": 500,
+  "global_step": 1500,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
       "learning_rate": 0.00019783319385748891,
       "loss": 0.005371841043233872,
       "step": 600
+    },
+    {
+      "epoch": 0.20935220935220936,
+      "grad_norm": 0.03531381115317345,
+      "learning_rate": 0.00019775798436093438,
+      "loss": 0.0034491006284952165,
+      "step": 610
+    },
+    {
+      "epoch": 0.21278421278421278,
+      "grad_norm": 0.3441513478755951,
+      "learning_rate": 0.00019768150657209797,
+      "loss": 0.0036659084260463716,
+      "step": 620
+    },
+    {
+      "epoch": 0.21621621621621623,
+      "grad_norm": 0.12480789422988892,
+      "learning_rate": 0.00019760376148318697,
+      "loss": 0.005842024832963944,
+      "step": 630
+    },
+    {
+      "epoch": 0.21964821964821965,
+      "grad_norm": 0.18049751222133636,
+      "learning_rate": 0.00019752475010285044,
+      "loss": 0.004476210474967957,
+      "step": 640
+    },
+    {
+      "epoch": 0.22308022308022307,
+      "grad_norm": 0.03993777185678482,
+      "learning_rate": 0.00019744447345616603,
+      "loss": 0.002967344969511032,
+      "step": 650
+    },
+    {
+      "epoch": 0.22651222651222652,
+      "grad_norm": 0.014075578190386295,
+      "learning_rate": 0.00019736293258462663,
+      "loss": 0.0038539741188287737,
+      "step": 660
+    },
+    {
+      "epoch": 0.22994422994422994,
+      "grad_norm": 0.10091689974069595,
+      "learning_rate": 0.00019728012854612707,
+      "loss": 0.0027353862300515176,
+      "step": 670
+    },
+    {
+      "epoch": 0.23337623337623337,
+      "grad_norm": 0.07889121025800705,
+      "learning_rate": 0.00019719606241495015,
+      "loss": 0.002710958570241928,
+      "step": 680
+    },
+    {
+      "epoch": 0.23680823680823682,
+      "grad_norm": 0.30752819776535034,
+      "learning_rate": 0.00019711073528175276,
+      "loss": 0.00333489403128624,
+      "step": 690
+    },
+    {
+      "epoch": 0.24024024024024024,
+      "grad_norm": 0.044186487793922424,
+      "learning_rate": 0.00019702414825355192,
+      "loss": 0.0013431076891720294,
+      "step": 700
+    },
+    {
+      "epoch": 0.24367224367224366,
+      "grad_norm": 0.002193969674408436,
+      "learning_rate": 0.00019693630245371012,
+      "loss": 0.0016361255198717116,
+      "step": 710
+    },
+    {
+      "epoch": 0.2471042471042471,
+      "grad_norm": 0.0048366570845246315,
+      "learning_rate": 0.00019684719902192098,
+      "loss": 0.0031337443739175796,
+      "step": 720
+    },
+    {
+      "epoch": 0.25053625053625056,
+      "grad_norm": 0.003241632366552949,
+      "learning_rate": 0.0001967568391141944,
+      "loss": 0.0018327673897147179,
+      "step": 730
+    },
+    {
+      "epoch": 0.25396825396825395,
+      "grad_norm": 0.07791941612958908,
+      "learning_rate": 0.00019666522390284155,
+      "loss": 0.005795871838927269,
+      "step": 740
+    },
+    {
+      "epoch": 0.2574002574002574,
+      "grad_norm": 0.04366520047187805,
+      "learning_rate": 0.00019657235457645956,
+      "loss": 0.0033348515629768372,
+      "step": 750
+    },
+    {
+      "epoch": 0.26083226083226085,
+      "grad_norm": 0.013953814283013344,
+      "learning_rate": 0.00019647823233991623,
+      "loss": 0.00804762840270996,
+      "step": 760
+    },
+    {
+      "epoch": 0.26426426426426425,
+      "grad_norm": 0.017218926921486855,
+      "learning_rate": 0.00019638285841433442,
+      "loss": 0.005346460640430451,
+      "step": 770
+    },
+    {
+      "epoch": 0.2676962676962677,
+      "grad_norm": 1.2277870178222656,
+      "learning_rate": 0.00019628623403707612,
+      "loss": 0.005925276502966881,
+      "step": 780
+    },
+    {
+      "epoch": 0.27112827112827115,
+      "grad_norm": 0.05051695927977562,
+      "learning_rate": 0.00019618836046172647,
+      "loss": 0.0050374619662761685,
+      "step": 790
+    },
+    {
+      "epoch": 0.27456027456027454,
+      "grad_norm": 0.004715205170214176,
+      "learning_rate": 0.00019608923895807732,
+      "loss": 0.0031466834247112275,
+      "step": 800
+    },
+    {
+      "epoch": 0.277992277992278,
+      "grad_norm": 0.15201859176158905,
+      "learning_rate": 0.00019598887081211103,
+      "loss": 0.00553952269256115,
+      "step": 810
+    },
+    {
+      "epoch": 0.28142428142428144,
+      "grad_norm": 0.017294466495513916,
+      "learning_rate": 0.00019588725732598358,
+      "loss": 0.0026330363005399706,
+      "step": 820
+    },
+    {
+      "epoch": 0.28485628485628484,
+      "grad_norm": 0.009572334587574005,
+      "learning_rate": 0.0001957843998180077,
+      "loss": 0.0053256206214427945,
+      "step": 830
+    },
+    {
+      "epoch": 0.2882882882882883,
+      "grad_norm": 0.9419263005256653,
+      "learning_rate": 0.00019568029962263592,
+      "loss": 0.005014676600694656,
+      "step": 840
+    },
+    {
+      "epoch": 0.29172029172029174,
+      "grad_norm": 0.007972943596541882,
+      "learning_rate": 0.0001955749580904431,
+      "loss": 0.002865707501769066,
+      "step": 850
+    },
+    {
+      "epoch": 0.29515229515229513,
+      "grad_norm": 0.0037338004913181067,
+      "learning_rate": 0.00019546837658810883,
+      "loss": 0.002737715095281601,
+      "step": 860
+    },
+    {
+      "epoch": 0.2985842985842986,
+      "grad_norm": 0.044745851308107376,
+      "learning_rate": 0.00019536055649840007,
+      "loss": 0.005683861300349235,
+      "step": 870
+    },
+    {
+      "epoch": 0.30201630201630203,
+      "grad_norm": 0.3476060628890991,
+      "learning_rate": 0.00019525149922015268,
+      "loss": 0.007439766824245453,
+      "step": 880
+    },
+    {
+      "epoch": 0.3054483054483054,
+      "grad_norm": 0.046753134578466415,
+      "learning_rate": 0.00019514120616825377,
+      "loss": 0.009560897201299667,
+      "step": 890
+    },
+    {
+      "epoch": 0.3088803088803089,
+      "grad_norm": 0.6365974545478821,
+      "learning_rate": 0.00019502967877362305,
+      "loss": 0.006552433967590332,
+      "step": 900
+    },
+    {
+      "epoch": 0.3123123123123123,
+      "grad_norm": 1.7795714139938354,
+      "learning_rate": 0.00019491691848319432,
+      "loss": 0.0097378209233284,
+      "step": 910
+    },
+    {
+      "epoch": 0.3157443157443157,
+      "grad_norm": 0.030942745506763458,
+      "learning_rate": 0.00019480292675989677,
+      "loss": 0.011464773118495942,
+      "step": 920
+    },
+    {
+      "epoch": 0.31917631917631917,
+      "grad_norm": 0.05824369192123413,
+      "learning_rate": 0.00019468770508263586,
+      "loss": 0.0077786631882190704,
+      "step": 930
+    },
+    {
+      "epoch": 0.3226083226083226,
+      "grad_norm": 1.115598440170288,
+      "learning_rate": 0.00019457125494627431,
+      "loss": 0.005580966919660568,
+      "step": 940
+    },
+    {
+      "epoch": 0.32604032604032607,
+      "grad_norm": 0.11986860632896423,
+      "learning_rate": 0.00019445357786161265,
+      "loss": 0.01756148934364319,
+      "step": 950
+    },
+    {
+      "epoch": 0.32947232947232946,
+      "grad_norm": 0.026070566847920418,
+      "learning_rate": 0.00019433467535536947,
+      "loss": 0.01354750245809555,
+      "step": 960
+    },
+    {
+      "epoch": 0.3329043329043329,
+      "grad_norm": 0.11441248655319214,
+      "learning_rate": 0.0001942145489701618,
+      "loss": 0.008156213909387589,
+      "step": 970
+    },
+    {
+      "epoch": 0.33633633633633636,
+      "grad_norm": 0.3980163037776947,
+      "learning_rate": 0.00019409320026448504,
+      "loss": 0.0047673903405666355,
+      "step": 980
+    },
+    {
+      "epoch": 0.33976833976833976,
+      "grad_norm": 1.6305179595947266,
+      "learning_rate": 0.0001939706308126927,
+      "loss": 0.012219312787055969,
+      "step": 990
+    },
+    {
+      "epoch": 0.3432003432003432,
+      "grad_norm": 0.3982492983341217,
+      "learning_rate": 0.00019384684220497605,
+      "loss": 0.011943883448839187,
+      "step": 1000
+    },
+    {
+      "epoch": 0.34663234663234666,
+      "grad_norm": 0.8722233772277832,
+      "learning_rate": 0.00019372183604734336,
+      "loss": 0.01119406521320343,
+      "step": 1010
+    },
+    {
+      "epoch": 0.35006435006435005,
+      "grad_norm": 0.4889911413192749,
+      "learning_rate": 0.00019359561396159922,
+      "loss": 0.01964961290359497,
+      "step": 1020
+    },
+    {
+      "epoch": 0.3534963534963535,
+      "grad_norm": 2.336963176727295,
+      "learning_rate": 0.00019346817758532337,
+      "loss": 0.013343000411987304,
+      "step": 1030
+    },
+    {
+      "epoch": 0.35692835692835695,
+      "grad_norm": 5.095973014831543,
+      "learning_rate": 0.0001933395285718495,
+      "loss": 0.045030930638313295,
+      "step": 1040
+    },
+    {
+      "epoch": 0.36036036036036034,
+      "grad_norm": 1.8921918869018555,
+      "learning_rate": 0.00019320966859024397,
+      "loss": 0.017123931646347047,
+      "step": 1050
+    },
+    {
+      "epoch": 0.3637923637923638,
+      "grad_norm": 0.32683122158050537,
+      "learning_rate": 0.00019307859932528375,
+      "loss": 0.0226660281419754,
+      "step": 1060
+    },
+    {
+      "epoch": 0.36722436722436724,
+      "grad_norm": 1.3853524923324585,
+      "learning_rate": 0.000192946322477435,
+      "loss": 0.009634046256542206,
+      "step": 1070
+    },
+    {
+      "epoch": 0.37065637065637064,
+      "grad_norm": 0.34646913409233093,
+      "learning_rate": 0.0001928128397628307,
+      "loss": 0.03943045735359192,
+      "step": 1080
+    },
+    {
+      "epoch": 0.3740883740883741,
+      "grad_norm": 0.8397905826568604,
+      "learning_rate": 0.00019267815291324852,
+      "loss": 0.017884735763072968,
+      "step": 1090
+    },
+    {
+      "epoch": 0.37752037752037754,
+      "grad_norm": 0.9594695568084717,
+      "learning_rate": 0.00019254226367608842,
+      "loss": 0.01769815683364868,
+      "step": 1100
+    },
+    {
+      "epoch": 0.38095238095238093,
+      "grad_norm": 0.7242799997329712,
+      "learning_rate": 0.0001924051738143498,
+      "loss": 0.04254389405250549,
+      "step": 1110
+    },
+    {
+      "epoch": 0.3843843843843844,
+      "grad_norm": 0.1905749887228012,
+      "learning_rate": 0.00019226688510660877,
+      "loss": 0.011978869885206222,
+      "step": 1120
+    },
+    {
+      "epoch": 0.38781638781638783,
+      "grad_norm": 0.12193301320075989,
+      "learning_rate": 0.00019212739934699498,
+      "loss": 0.010143650323152542,
+      "step": 1130
+    },
+    {
+      "epoch": 0.3912483912483912,
+      "grad_norm": 0.20223842561244965,
+      "learning_rate": 0.00019198671834516843,
+      "loss": 0.012704399228096009,
+      "step": 1140
+    },
+    {
+      "epoch": 0.3946803946803947,
+      "grad_norm": 0.16973139345645905,
+      "learning_rate": 0.00019184484392629586,
+      "loss": 0.009967386722564697,
+      "step": 1150
+    },
+    {
+      "epoch": 0.3981123981123981,
+      "grad_norm": 0.3374408185482025,
+      "learning_rate": 0.00019170177793102736,
+      "loss": 0.013026086986064911,
+      "step": 1160
+    },
+    {
+      "epoch": 0.4015444015444015,
+      "grad_norm": 0.6263651847839355,
+      "learning_rate": 0.0001915575222154721,
+      "loss": 0.014929966628551483,
+      "step": 1170
+    },
+    {
+      "epoch": 0.40497640497640497,
+      "grad_norm": 0.5159519910812378,
+      "learning_rate": 0.00019141207865117448,
+      "loss": 0.022531284391880034,
+      "step": 1180
+    },
+    {
+      "epoch": 0.4084084084084084,
+      "grad_norm": 0.27669215202331543,
+      "learning_rate": 0.0001912654491250899,
+      "loss": 0.013255235552787781,
+      "step": 1190
+    },
+    {
+      "epoch": 0.4118404118404118,
+      "grad_norm": 0.3306339383125305,
+      "learning_rate": 0.00019111763553956006,
+      "loss": 0.0071789674460887905,
+      "step": 1200
+    },
+    {
+      "epoch": 0.41527241527241526,
+      "grad_norm": 0.1400749832391739,
+      "learning_rate": 0.0001909686398122885,
+      "loss": 0.012304867804050445,
+      "step": 1210
+    },
+    {
+      "epoch": 0.4187044187044187,
+      "grad_norm": 0.480392724275589,
+      "learning_rate": 0.0001908184638763156,
+      "loss": 0.013182352483272552,
+      "step": 1220
+    },
+    {
+      "epoch": 0.42213642213642216,
+      "grad_norm": 0.07393156737089157,
+      "learning_rate": 0.00019066710967999352,
+      "loss": 0.01357671171426773,
+      "step": 1230
+    },
+    {
+      "epoch": 0.42556842556842556,
+      "grad_norm": 0.5818315148353577,
+      "learning_rate": 0.00019051457918696092,
+      "loss": 0.01494317352771759,
+      "step": 1240
+    },
+    {
+      "epoch": 0.429000429000429,
+      "grad_norm": 0.12537962198257446,
+      "learning_rate": 0.0001903608743761175,
+      "loss": 0.016142460703849792,
+      "step": 1250
+    },
+    {
+      "epoch": 0.43243243243243246,
+      "grad_norm": 0.6750714778900146,
+      "learning_rate": 0.00019020599724159842,
+      "loss": 0.010620266944169999,
+      "step": 1260
+    },
+    {
+      "epoch": 0.43586443586443585,
+      "grad_norm": 2.9211416244506836,
+      "learning_rate": 0.00019004994979274816,
+      "loss": 0.02269883006811142,
+      "step": 1270
+    },
+    {
+      "epoch": 0.4392964392964393,
+      "grad_norm": 0.16892600059509277,
+      "learning_rate": 0.0001898927340540947,
+      "loss": 0.020440049469470978,
+      "step": 1280
+    },
+    {
+      "epoch": 0.44272844272844275,
+      "grad_norm": 1.130327820777893,
+      "learning_rate": 0.00018973435206532323,
+      "loss": 0.012587438523769378,
+      "step": 1290
+    },
+    {
+      "epoch": 0.44616044616044614,
+      "grad_norm": 1.3412842750549316,
+      "learning_rate": 0.00018957480588124956,
+      "loss": 0.009808909147977829,
+      "step": 1300
+    },
+    {
+      "epoch": 0.4495924495924496,
+      "grad_norm": 0.8361186385154724,
+      "learning_rate": 0.00018941409757179353,
+      "loss": 0.010318058729171752,
+      "step": 1310
+    },
+    {
+      "epoch": 0.45302445302445304,
+      "grad_norm": 0.030115943402051926,
+      "learning_rate": 0.00018925222922195223,
+      "loss": 0.006556583940982819,
+      "step": 1320
+    },
+    {
+      "epoch": 0.45645645645645644,
+      "grad_norm": 0.07012518495321274,
+      "learning_rate": 0.0001890892029317728,
+      "loss": 0.006556279957294464,
+      "step": 1330
+    },
+    {
+      "epoch": 0.4598884598884599,
+      "grad_norm": 0.44485822319984436,
+      "learning_rate": 0.0001889250208163253,
+      "loss": 0.006202537938952446,
+      "step": 1340
+    },
+    {
+      "epoch": 0.46332046332046334,
+      "grad_norm": 0.7814382910728455,
+      "learning_rate": 0.00018875968500567524,
+      "loss": 0.007819340378046036,
+      "step": 1350
+    },
+    {
+      "epoch": 0.46675246675246673,
+      "grad_norm": 2.737605094909668,
+      "learning_rate": 0.00018859319764485594,
+      "loss": 0.007837437093257904,
+      "step": 1360
+    },
+    {
+      "epoch": 0.4701844701844702,
+      "grad_norm": 2.685511350631714,
+      "learning_rate": 0.00018842556089384066,
+      "loss": 0.007956080138683319,
+      "step": 1370
+    },
+    {
+      "epoch": 0.47361647361647363,
+      "grad_norm": 0.1693638116121292,
+      "learning_rate": 0.00018825677692751465,
+      "loss": 0.011585228145122528,
+      "step": 1380
+    },
+    {
+      "epoch": 0.477048477048477,
+      "grad_norm": 0.6162756085395813,
+      "learning_rate": 0.00018808684793564692,
+      "loss": 0.016635045409202576,
+      "step": 1390
+    },
+    {
+      "epoch": 0.4804804804804805,
+      "grad_norm": 0.04063958674669266,
+      "learning_rate": 0.00018791577612286176,
+      "loss": 0.014098620414733887,
+      "step": 1400
+    },
+    {
+      "epoch": 0.4839124839124839,
+      "grad_norm": 0.25344181060791016,
+      "learning_rate": 0.00018774356370861018,
+      "loss": 0.02437097132205963,
+      "step": 1410
+    },
+    {
+      "epoch": 0.4873444873444873,
+      "grad_norm": 0.04818904772400856,
+      "learning_rate": 0.0001875702129271412,
+      "loss": 0.00916079357266426,
+      "step": 1420
+    },
+    {
+      "epoch": 0.49077649077649077,
+      "grad_norm": 0.5269466042518616,
+      "learning_rate": 0.00018739572602747268,
+      "loss": 0.009022314101457596,
+      "step": 1430
+    },
+    {
+      "epoch": 0.4942084942084942,
+      "grad_norm": 0.21276459097862244,
+      "learning_rate": 0.00018722010527336233,
+      "loss": 0.006045453995466232,
+      "step": 1440
+    },
+    {
+      "epoch": 0.4976404976404976,
+      "grad_norm": 0.3551492989063263,
+      "learning_rate": 0.00018704335294327827,
+      "loss": 0.010270991921424865,
+      "step": 1450
+    },
+    {
+      "epoch": 0.5010725010725011,
+      "grad_norm": 0.04858814924955368,
+      "learning_rate": 0.00018686547133036933,
+      "loss": 0.00782613679766655,
+      "step": 1460
+    },
+    {
+      "epoch": 0.5045045045045045,
+      "grad_norm": 0.05507025867700577,
+      "learning_rate": 0.0001866864627424356,
+      "loss": 0.008478378504514694,
+      "step": 1470
+    },
+    {
+      "epoch": 0.5079365079365079,
+      "grad_norm": 0.22750328481197357,
+      "learning_rate": 0.0001865063295018982,
+      "loss": 0.0060363378375768665,
+      "step": 1480
+    },
+    {
+      "epoch": 0.5113685113685114,
+      "grad_norm": 0.06937054544687271,
+      "learning_rate": 0.00018632507394576926,
+      "loss": 0.012915070354938506,
+      "step": 1490
+    },
+    {
+      "epoch": 0.5148005148005148,
+      "grad_norm": 0.019404951483011246,
+      "learning_rate": 0.00018614269842562168,
+      "loss": 0.005564029887318611,
+      "step": 1500
     }
   ],
   "logging_steps": 10,
       "attributes": {}
     }
   },
+  "total_flos": 5689281875559936.0,
   "train_batch_size": 2,
   "trial_name": null,
   "trial_params": null