alicegoesdown commited on
Commit
6e04ffd
·
verified ·
1 Parent(s): 0429dd7

Training in progress, step 150, checkpoint

Browse files
last-checkpoint/lora_lower/adapter_config.json CHANGED
@@ -216,62 +216,62 @@
216
  },
217
  "revision": null,
218
  "target_modules": [
219
- "transformer.h.10.mlp.dense_4h_to_h",
 
 
 
 
 
 
 
 
 
 
 
220
  "transformer.h.3.mlp.dense_h_to_4h",
221
- "transformer.h.9.self_attention.dense",
222
- "transformer.h.6.mlp.dense_4h_to_h",
 
 
 
223
  "transformer.h.11.self_attention.dense",
 
 
224
  "transformer.h.12.mlp.dense_h_to_4h",
225
- "transformer.h.0.mlp.dense_4h_to_h",
226
- "transformer.h.0.self_attention.dense",
227
- "transformer.h.8.mlp.dense_h_to_4h",
228
- "transformer.h.13.self_attention.dense",
229
- "transformer.h.1.self_attention.dense",
230
- "transformer.h.7.mlp.dense_4h_to_h",
231
- "transformer.h.1.mlp.dense_4h_to_h",
232
- "transformer.h.0.self_attention.query_key_value",
233
- "transformer.h.11.self_attention.query_key_value",
234
  "transformer.h.12.mlp.dense_4h_to_h",
235
- "transformer.h.6.mlp.dense_h_to_4h",
236
- "transformer.h.7.self_attention.dense",
237
- "transformer.h.8.mlp.dense_4h_to_h",
238
  "transformer.h.11.mlp.dense_h_to_4h",
239
- "transformer.h.13.self_attention.query_key_value",
240
- "transformer.h.1.self_attention.query_key_value",
241
- "transformer.h.9.mlp.dense_h_to_4h",
242
- "transformer.h.4.mlp.dense_4h_to_h",
243
- "transformer.h.7.self_attention.query_key_value",
244
- "transformer.h.4.mlp.dense_h_to_4h",
245
  "transformer.h.1.mlp.dense_h_to_4h",
246
- "transformer.h.11.mlp.dense_4h_to_h",
247
- "transformer.h.2.mlp.dense_h_to_4h",
248
- "transformer.h.12.self_attention.query_key_value",
249
- "transformer.h.3.self_attention.query_key_value",
250
- "transformer.h.2.self_attention.query_key_value",
251
- "transformer.h.2.self_attention.dense",
252
- "transformer.h.2.mlp.dense_4h_to_h",
253
- "transformer.h.4.self_attention.query_key_value",
254
- "transformer.h.8.self_attention.query_key_value",
255
- "transformer.h.4.self_attention.dense",
256
- "transformer.h.13.mlp.dense_4h_to_h",
257
- "transformer.h.10.mlp.dense_h_to_4h",
258
  "transformer.h.5.mlp.dense_h_to_4h",
 
 
 
 
 
259
  "transformer.h.9.self_attention.query_key_value",
260
- "transformer.h.7.mlp.dense_h_to_4h",
261
- "transformer.h.5.self_attention.query_key_value",
262
- "transformer.h.5.self_attention.dense",
263
- "transformer.h.3.mlp.dense_4h_to_h",
264
- "transformer.h.0.mlp.dense_h_to_4h",
265
- "transformer.h.6.self_attention.query_key_value",
 
 
266
  "transformer.h.5.mlp.dense_4h_to_h",
267
- "transformer.h.9.mlp.dense_4h_to_h",
268
- "transformer.h.10.self_attention.query_key_value",
269
- "transformer.h.12.self_attention.dense",
270
- "transformer.h.6.self_attention.dense",
271
  "transformer.h.8.self_attention.dense",
272
- "transformer.h.10.self_attention.dense",
 
 
 
273
  "transformer.h.3.self_attention.dense",
274
- "transformer.h.13.mlp.dense_h_to_4h"
 
 
 
 
275
  ],
276
  "task_type": "CAUSAL_LM",
277
  "use_dora": false,
 
216
  },
217
  "revision": null,
218
  "target_modules": [
219
+ "transformer.h.8.mlp.dense_4h_to_h",
220
+ "transformer.h.6.self_attention.query_key_value",
221
+ "transformer.h.4.mlp.dense_h_to_4h",
222
+ "transformer.h.11.mlp.dense_4h_to_h",
223
+ "transformer.h.5.self_attention.query_key_value",
224
+ "transformer.h.10.mlp.dense_h_to_4h",
225
+ "transformer.h.4.self_attention.query_key_value",
226
+ "transformer.h.4.mlp.dense_4h_to_h",
227
+ "transformer.h.1.mlp.dense_4h_to_h",
228
+ "transformer.h.13.mlp.dense_4h_to_h",
229
+ "transformer.h.6.self_attention.dense",
230
+ "transformer.h.13.mlp.dense_h_to_4h",
231
  "transformer.h.3.mlp.dense_h_to_4h",
232
+ "transformer.h.2.mlp.dense_4h_to_h",
233
+ "transformer.h.9.mlp.dense_4h_to_h",
234
+ "transformer.h.7.self_attention.dense",
235
+ "transformer.h.8.self_attention.query_key_value",
236
+ "transformer.h.6.mlp.dense_h_to_4h",
237
  "transformer.h.11.self_attention.dense",
238
+ "transformer.h.10.mlp.dense_4h_to_h",
239
+ "transformer.h.12.self_attention.query_key_value",
240
  "transformer.h.12.mlp.dense_h_to_4h",
 
 
 
 
 
 
 
 
 
241
  "transformer.h.12.mlp.dense_4h_to_h",
242
+ "transformer.h.0.mlp.dense_h_to_4h",
 
 
243
  "transformer.h.11.mlp.dense_h_to_4h",
244
+ "transformer.h.7.mlp.dense_h_to_4h",
 
 
 
 
 
245
  "transformer.h.1.mlp.dense_h_to_4h",
246
+ "transformer.h.13.self_attention.query_key_value",
247
+ "transformer.h.9.self_attention.dense",
 
 
 
 
 
 
 
 
 
 
248
  "transformer.h.5.mlp.dense_h_to_4h",
249
+ "transformer.h.10.self_attention.dense",
250
+ "transformer.h.0.mlp.dense_4h_to_h",
251
+ "transformer.h.11.self_attention.query_key_value",
252
+ "transformer.h.2.mlp.dense_h_to_4h",
253
+ "transformer.h.10.self_attention.query_key_value",
254
  "transformer.h.9.self_attention.query_key_value",
255
+ "transformer.h.8.mlp.dense_h_to_4h",
256
+ "transformer.h.0.self_attention.query_key_value",
257
+ "transformer.h.0.self_attention.dense",
258
+ "transformer.h.4.self_attention.dense",
259
+ "transformer.h.13.self_attention.dense",
260
+ "transformer.h.2.self_attention.query_key_value",
261
+ "transformer.h.3.self_attention.query_key_value",
262
+ "transformer.h.7.self_attention.query_key_value",
263
  "transformer.h.5.mlp.dense_4h_to_h",
 
 
 
 
264
  "transformer.h.8.self_attention.dense",
265
+ "transformer.h.9.mlp.dense_h_to_4h",
266
+ "transformer.h.6.mlp.dense_4h_to_h",
267
+ "transformer.h.7.mlp.dense_4h_to_h",
268
+ "transformer.h.3.mlp.dense_4h_to_h",
269
  "transformer.h.3.self_attention.dense",
270
+ "transformer.h.1.self_attention.query_key_value",
271
+ "transformer.h.1.self_attention.dense",
272
+ "transformer.h.5.self_attention.dense",
273
+ "transformer.h.12.self_attention.dense",
274
+ "transformer.h.2.self_attention.dense"
275
  ],
276
  "task_type": "CAUSAL_LM",
277
  "use_dora": false,
last-checkpoint/lora_lower/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c71de63d57b7240e7fbd8d372357da2c05fa7745f15eba2264834b73152da4e1
3
  size 2058899176
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1843627ca7f5c8892cc27ec365b7b71475f9c7c28bf3db1528f68975cab934af
3
  size 2058899176
last-checkpoint/lora_middle/adapter_config.json CHANGED
@@ -216,34 +216,34 @@
216
  },
217
  "revision": null,
218
  "target_modules": [
219
- "transformer.h.19.mlp.dense_h_to_4h",
220
- "transformer.h.20.mlp.dense_4h_to_h",
221
- "transformer.h.17.self_attention.dense",
222
- "transformer.h.19.mlp.dense_4h_to_h",
223
- "transformer.h.16.mlp.dense_4h_to_h",
224
- "transformer.h.20.self_attention.dense",
225
  "transformer.h.20.mlp.dense_h_to_4h",
226
- "transformer.h.17.self_attention.query_key_value",
227
- "transformer.h.15.self_attention.dense",
228
- "transformer.h.18.mlp.dense_4h_to_h",
229
  "transformer.h.20.self_attention.query_key_value",
 
230
  "transformer.h.16.mlp.dense_h_to_4h",
231
  "transformer.h.19.self_attention.dense",
232
- "transformer.h.14.self_attention.query_key_value",
233
- "transformer.h.15.mlp.dense_4h_to_h",
234
- "transformer.h.17.mlp.dense_h_to_4h",
235
- "transformer.h.16.self_attention.dense",
236
- "transformer.h.17.mlp.dense_4h_to_h",
 
237
  "transformer.h.19.self_attention.query_key_value",
238
- "transformer.h.18.self_attention.query_key_value",
239
- "transformer.h.18.mlp.dense_h_to_4h",
240
- "transformer.h.14.mlp.dense_h_to_4h",
241
  "transformer.h.18.self_attention.dense",
242
- "transformer.h.14.self_attention.dense",
243
- "transformer.h.15.mlp.dense_h_to_4h",
244
- "transformer.h.14.mlp.dense_4h_to_h",
245
  "transformer.h.16.self_attention.query_key_value",
246
- "transformer.h.15.self_attention.query_key_value"
 
 
 
 
 
 
 
 
 
247
  ],
248
  "task_type": "CAUSAL_LM",
249
  "use_dora": false,
 
216
  },
217
  "revision": null,
218
  "target_modules": [
 
 
 
 
 
 
219
  "transformer.h.20.mlp.dense_h_to_4h",
220
+ "transformer.h.17.mlp.dense_h_to_4h",
221
+ "transformer.h.14.self_attention.dense",
 
222
  "transformer.h.20.self_attention.query_key_value",
223
+ "transformer.h.14.mlp.dense_4h_to_h",
224
  "transformer.h.16.mlp.dense_h_to_4h",
225
  "transformer.h.19.self_attention.dense",
226
+ "transformer.h.20.mlp.dense_4h_to_h",
227
+ "transformer.h.15.mlp.dense_h_to_4h",
228
+ "transformer.h.15.self_attention.query_key_value",
229
+ "transformer.h.18.mlp.dense_4h_to_h",
230
+ "transformer.h.16.mlp.dense_4h_to_h",
231
+ "transformer.h.19.mlp.dense_h_to_4h",
232
  "transformer.h.19.self_attention.query_key_value",
233
+ "transformer.h.15.self_attention.dense",
 
 
234
  "transformer.h.18.self_attention.dense",
235
+ "transformer.h.17.mlp.dense_4h_to_h",
 
 
236
  "transformer.h.16.self_attention.query_key_value",
237
+ "transformer.h.19.mlp.dense_4h_to_h",
238
+ "transformer.h.20.self_attention.dense",
239
+ "transformer.h.18.self_attention.query_key_value",
240
+ "transformer.h.15.mlp.dense_4h_to_h",
241
+ "transformer.h.14.mlp.dense_h_to_4h",
242
+ "transformer.h.17.self_attention.dense",
243
+ "transformer.h.16.self_attention.dense",
244
+ "transformer.h.18.mlp.dense_h_to_4h",
245
+ "transformer.h.17.self_attention.query_key_value",
246
+ "transformer.h.14.self_attention.query_key_value"
247
  ],
248
  "task_type": "CAUSAL_LM",
249
  "use_dora": false,
last-checkpoint/lora_middle/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2213d810c5041f54706922363673f8eb5b59dcb31caea318ab503635efa556b8
3
  size 2058889288
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e2705a293473662f885bc2c9b4b1643921a5f4db0ad8025e88a7dcd2aa0221f5
3
  size 2058889288
last-checkpoint/lora_top/adapter_config.json CHANGED
@@ -216,18 +216,18 @@
216
  },
217
  "revision": null,
218
  "target_modules": [
219
- "transformer.h.23.self_attention.query_key_value",
220
- "transformer.h.23.self_attention.dense",
221
- "transformer.h.23.mlp.dense_4h_to_h",
222
  "transformer.h.21.self_attention.query_key_value",
223
- "transformer.h.22.mlp.dense_4h_to_h",
224
- "transformer.h.22.mlp.dense_h_to_4h",
225
  "transformer.h.21.mlp.dense_4h_to_h",
226
  "transformer.h.22.self_attention.dense",
227
- "transformer.h.21.self_attention.dense",
 
 
228
  "transformer.h.21.mlp.dense_h_to_4h",
229
- "transformer.h.22.self_attention.query_key_value",
230
- "transformer.h.23.mlp.dense_h_to_4h"
231
  ],
232
  "task_type": "CAUSAL_LM",
233
  "use_dora": false,
 
216
  },
217
  "revision": null,
218
  "target_modules": [
219
+ "transformer.h.21.self_attention.dense",
220
+ "transformer.h.22.self_attention.query_key_value",
 
221
  "transformer.h.21.self_attention.query_key_value",
222
+ "transformer.h.23.self_attention.query_key_value",
 
223
  "transformer.h.21.mlp.dense_4h_to_h",
224
  "transformer.h.22.self_attention.dense",
225
+ "transformer.h.23.self_attention.dense",
226
+ "transformer.h.22.mlp.dense_4h_to_h",
227
+ "transformer.h.23.mlp.dense_h_to_4h",
228
  "transformer.h.21.mlp.dense_h_to_4h",
229
+ "transformer.h.23.mlp.dense_4h_to_h",
230
+ "transformer.h.22.mlp.dense_h_to_4h"
231
  ],
232
  "task_type": "CAUSAL_LM",
233
  "use_dora": false,
last-checkpoint/lora_top/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2d4aea319a00da26eded5fb5f7b6e929fc49a6e7d9f7345dd93d7aa68210429b
3
  size 2058359328
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4a788bb0ebd30119e21444b5ca652ef18fb146fe392e34129c932ff9be592f7a
3
  size 2058359328
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3eee6627247442ca4d139e5d97b358ae844921bc8a42352b9613fadfdcea5ccb
3
- size 2061521939
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:839291e7974e74a87e53a123a3881b8e276d083b1f2193ef431e4dc80bb107c3
3
+ size 2061522259
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ab0b438a7c81238d9b63833459e05c2d46a240a047ece3e60377286affa39c62
3
- size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b6f51347b38751994e31f402f6d1cbfdce41c21e3b2e0fd15f1fdf02faa3c7d5
3
+ size 1256
last-checkpoint/trainer_state.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "best_metric": 4.293993949890137,
3
  "best_model_checkpoint": "./output/checkpoint-150",
4
  "epoch": 0.02449779519843214,
5
  "eval_steps": 150,
@@ -10,115 +10,115 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.0016331863465621427,
13
- "grad_norm": 53.87973403930664,
14
- "learning_rate": 5e-06,
15
- "loss": 4.9073,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.0032663726931242854,
20
- "grad_norm": 45.79116439819336,
21
- "learning_rate": 1e-05,
22
- "loss": 4.551,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.004899559039686428,
27
- "grad_norm": 51.187843322753906,
28
- "learning_rate": 1.5e-05,
29
- "loss": 4.6312,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.006532745386248571,
34
- "grad_norm": 50.05643081665039,
35
- "learning_rate": 2e-05,
36
- "loss": 4.6128,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.008165931732810714,
41
- "grad_norm": 49.844581604003906,
42
- "learning_rate": 2.5e-05,
43
- "loss": 4.5007,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.009799118079372856,
48
- "grad_norm": 53.74774169921875,
49
- "learning_rate": 3e-05,
50
- "loss": 4.4041,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.011432304425935,
55
- "grad_norm": 52.074703216552734,
56
- "learning_rate": 3.5e-05,
57
- "loss": 4.4002,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.013065490772497142,
62
- "grad_norm": 51.11018371582031,
63
- "learning_rate": 4e-05,
64
- "loss": 4.3201,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.014698677119059285,
69
- "grad_norm": 54.45034408569336,
70
- "learning_rate": 4.5e-05,
71
- "loss": 4.2208,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 0.01633186346562143,
76
- "grad_norm": 55.54214859008789,
77
- "learning_rate": 5e-05,
78
- "loss": 4.3211,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 0.01796504981218357,
83
- "grad_norm": 60.387210845947266,
84
- "learning_rate": 4.999948617395915e-05,
85
- "loss": 4.2806,
86
  "step": 110
87
  },
88
  {
89
  "epoch": 0.019598236158745713,
90
- "grad_norm": 55.413124084472656,
91
- "learning_rate": 4.9997944716957985e-05,
92
- "loss": 4.2696,
93
  "step": 120
94
  },
95
  {
96
  "epoch": 0.021231422505307854,
97
- "grad_norm": 58.15327835083008,
98
- "learning_rate": 4.9995375692359755e-05,
99
- "loss": 4.2277,
100
  "step": 130
101
  },
102
  {
103
  "epoch": 0.02286460885187,
104
- "grad_norm": 54.6749153137207,
105
- "learning_rate": 4.9991779205767e-05,
106
- "loss": 4.1476,
107
  "step": 140
108
  },
109
  {
110
  "epoch": 0.02449779519843214,
111
- "grad_norm": 58.392120361328125,
112
- "learning_rate": 4.99871554050172e-05,
113
- "loss": 4.2109,
114
  "step": 150
115
  },
116
  {
117
  "epoch": 0.02449779519843214,
118
- "eval_loss": 4.293993949890137,
119
- "eval_runtime": 12.3191,
120
- "eval_samples_per_second": 40.587,
121
- "eval_steps_per_second": 40.587,
122
  "step": 150
123
  }
124
  ],
 
1
  {
2
+ "best_metric": 4.046905040740967,
3
  "best_model_checkpoint": "./output/checkpoint-150",
4
  "epoch": 0.02449779519843214,
5
  "eval_steps": 150,
 
10
  "log_history": [
11
  {
12
  "epoch": 0.0016331863465621427,
13
+ "grad_norm": 60.602169036865234,
14
+ "learning_rate": 2.154434690031884e-06,
15
+ "loss": 4.2742,
16
  "step": 10
17
  },
18
  {
19
  "epoch": 0.0032663726931242854,
20
+ "grad_norm": 57.758113861083984,
21
+ "learning_rate": 4.308869380063768e-06,
22
+ "loss": 4.0288,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.004899559039686428,
27
+ "grad_norm": 55.49625778198242,
28
+ "learning_rate": 6.463304070095652e-06,
29
+ "loss": 4.1612,
30
  "step": 30
31
  },
32
  {
33
  "epoch": 0.006532745386248571,
34
+ "grad_norm": 46.22760772705078,
35
+ "learning_rate": 8.617738760127536e-06,
36
+ "loss": 4.1069,
37
  "step": 40
38
  },
39
  {
40
  "epoch": 0.008165931732810714,
41
+ "grad_norm": 47.324954986572266,
42
+ "learning_rate": 1.077217345015942e-05,
43
+ "loss": 4.0317,
44
  "step": 50
45
  },
46
  {
47
  "epoch": 0.009799118079372856,
48
+ "grad_norm": 46.110965728759766,
49
+ "learning_rate": 1.2926608140191304e-05,
50
+ "loss": 3.9325,
51
  "step": 60
52
  },
53
  {
54
  "epoch": 0.011432304425935,
55
+ "grad_norm": 47.741973876953125,
56
+ "learning_rate": 1.5081042830223187e-05,
57
+ "loss": 3.9713,
58
  "step": 70
59
  },
60
  {
61
  "epoch": 0.013065490772497142,
62
+ "grad_norm": 40.646671295166016,
63
+ "learning_rate": 1.723547752025507e-05,
64
+ "loss": 3.9214,
65
  "step": 80
66
  },
67
  {
68
  "epoch": 0.014698677119059285,
69
+ "grad_norm": 44.510902404785156,
70
+ "learning_rate": 1.9389912210286956e-05,
71
+ "loss": 3.9046,
72
  "step": 90
73
  },
74
  {
75
  "epoch": 0.01633186346562143,
76
+ "grad_norm": 43.503135681152344,
77
+ "learning_rate": 2.154434690031884e-05,
78
+ "loss": 3.971,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 0.01796504981218357,
83
+ "grad_norm": 53.51116180419922,
84
+ "learning_rate": 2.154412549938943e-05,
85
+ "loss": 3.9408,
86
  "step": 110
87
  },
88
  {
89
  "epoch": 0.019598236158745713,
90
+ "grad_norm": 45.50636672973633,
91
+ "learning_rate": 2.1543461305702127e-05,
92
+ "loss": 3.955,
93
  "step": 120
94
  },
95
  {
96
  "epoch": 0.021231422505307854,
97
+ "grad_norm": 45.04021072387695,
98
+ "learning_rate": 2.1542354346559332e-05,
99
+ "loss": 3.9399,
100
  "step": 130
101
  },
102
  {
103
  "epoch": 0.02286460885187,
104
+ "grad_norm": 41.114078521728516,
105
+ "learning_rate": 2.15408046674638e-05,
106
+ "loss": 3.857,
107
  "step": 140
108
  },
109
  {
110
  "epoch": 0.02449779519843214,
111
+ "grad_norm": 55.586185455322266,
112
+ "learning_rate": 2.1538812332116767e-05,
113
+ "loss": 3.9741,
114
  "step": 150
115
  },
116
  {
117
  "epoch": 0.02449779519843214,
118
+ "eval_loss": 4.046905040740967,
119
+ "eval_runtime": 12.6477,
120
+ "eval_samples_per_second": 39.533,
121
+ "eval_steps_per_second": 39.533,
122
  "step": 150
123
  }
124
  ],
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d8b68c731a984c7f0b77d284b1920ecf8f7068a1a51d636a4e6cfc2dbaf30e2b
3
- size 5368
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f40345b3fd623d3ea644b5cebfee50c24fbc32ad1c4eb11de85e1691db783a37
3
+ size 5496