Lollo9898 committed on
Commit
67c9814
·
verified ·
1 Parent(s): c3cf1e6

Add lora_tuned

Browse files
lora_tuned/checkpoint-550/adapter_config.json ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "HuggingFaceM4/Idefics3-8B-Llama3",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": false,
8
+ "init_lora_weights": "gaussian",
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 16,
14
+ "lora_dropout": 0.1,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 64,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "o_proj",
24
+ "text_model.layers.24.self_attn.v_proj",
25
+ "text_model.layers.21.self_attn.k_proj",
26
+ "text_model.layers.4.self_attn.k_proj",
27
+ "text_model.layers.11.self_attn.v_proj",
28
+ "down_proj",
29
+ "text_model.layers.19.self_attn.q_proj",
30
+ "31.self_attn.k_proj",
31
+ "text_model.layers.1.self_attn.v_proj",
32
+ "text_model.layers.21.self_attn.q_proj",
33
+ "text_model.layers.17.self_attn.v_proj",
34
+ "gate_proj",
35
+ "text_model.layers.9.self_attn.q_proj",
36
+ "text_model.layers.9.self_attn.k_proj",
37
+ "text_model.layers.17.self_attn.q_proj",
38
+ "text_model.layers.8.self_attn.v_proj",
39
+ "text_model.layers.3.self_attn.v_proj",
40
+ "text_model.layers.16.self_attn.q_proj",
41
+ "text_model.layers.24.self_attn.q_proj",
42
+ "text_model.layers.9.self_attn.v_proj",
43
+ "text_model.layers.22.self_attn.k_proj",
44
+ "text_model.layers.7.self_attn.q_proj",
45
+ "text_model.layers.12.self_attn.v_proj",
46
+ "30.self_attn.q_proj",
47
+ "text_model.layers.5.self_attn.v_proj",
48
+ "text_model.layers.16.self_attn.v_proj",
49
+ "text_model.layers.5.self_attn.q_proj",
50
+ "text_model.layers.25.self_attn.q_proj",
51
+ "text_model.layers.7.self_attn.v_proj",
52
+ "text_model.layers.26.self_attn.k_proj",
53
+ "text_model.layers.19.self_attn.k_proj",
54
+ "29.self_attn.k_proj",
55
+ "text_model.layers.2.self_attn.v_proj",
56
+ "text_model.layers.10.self_attn.q_proj",
57
+ "lm_head",
58
+ "text_model.layers.3.self_attn.q_proj",
59
+ "up_proj",
60
+ "28.self_attn.q_proj",
61
+ "text_model.layers.8.self_attn.k_proj",
62
+ "text_model.layers.3.self_attn.k_proj",
63
+ "text_model.layers.2.self_attn.q_proj",
64
+ "28.self_attn.k_proj",
65
+ "text_model.layers.10.self_attn.k_proj",
66
+ "text_model.layers.21.self_attn.v_proj",
67
+ "text_model.layers.20.self_attn.v_proj",
68
+ "text_model.layers.17.self_attn.k_proj",
69
+ "text_model.layers.20.self_attn.k_proj",
70
+ "text_model.layers.26.self_attn.q_proj",
71
+ "text_model.layers.0.self_attn.v_proj",
72
+ "text_model.layers.14.self_attn.q_proj",
73
+ "text_model.layers.11.self_attn.k_proj",
74
+ "text_model.layers.22.self_attn.q_proj",
75
+ "text_model.layers.6.self_attn.k_proj",
76
+ "text_model.layers.23.self_attn.k_proj",
77
+ "text_model.layers.2.self_attn.k_proj",
78
+ "text_model.layers.13.self_attn.q_proj",
79
+ "text_model.layers.12.self_attn.k_proj",
80
+ "text_model.layers.15.self_attn.q_proj",
81
+ "text_model.layers.22.self_attn.v_proj",
82
+ "text_model.layers.4.self_attn.v_proj",
83
+ "text_model.layers.18.self_attn.k_proj",
84
+ "text_model.layers.1.self_attn.q_proj",
85
+ "text_model.layers.7.self_attn.k_proj",
86
+ "text_model.layers.25.self_attn.k_proj",
87
+ "text_model.layers.23.self_attn.q_proj",
88
+ "27.self_attn.k_proj",
89
+ "text_model.layers.18.self_attn.v_proj",
90
+ "text_model.layers.18.self_attn.q_proj",
91
+ "29.self_attn.v_proj",
92
+ "text_model.layers.13.self_attn.k_proj",
93
+ "31.self_attn.q_proj",
94
+ "30.self_attn.v_proj",
95
+ "text_model.layers.1.self_attn.k_proj",
96
+ "28.self_attn.v_proj",
97
+ "30.self_attn.k_proj",
98
+ "text_model.layers.13.self_attn.v_proj",
99
+ "text_model.layers.4.self_attn.q_proj",
100
+ "text_model.layers.15.self_attn.v_proj",
101
+ "text_model.layers.20.self_attn.q_proj",
102
+ "text_model.layers.6.self_attn.v_proj",
103
+ "text_model.layers.15.self_attn.k_proj",
104
+ "text_model.layers.16.self_attn.k_proj",
105
+ "text_model.layers.5.self_attn.k_proj",
106
+ "text_model.layers.8.self_attn.q_proj",
107
+ "text_model.layers.19.self_attn.v_proj",
108
+ "text_model.layers.11.self_attn.q_proj",
109
+ "text_model.layers.26.self_attn.v_proj",
110
+ "text_model.layers.0.self_attn.q_proj",
111
+ "27.self_attn.v_proj",
112
+ "text_model.layers.14.self_attn.v_proj",
113
+ "text_model.layers.10.self_attn.v_proj",
114
+ "text_model.layers.14.self_attn.k_proj",
115
+ "text_model.layers.12.self_attn.q_proj",
116
+ "text_model.layers.25.self_attn.v_proj",
117
+ "text_model.layers.24.self_attn.k_proj",
118
+ "text_model.layers.23.self_attn.v_proj",
119
+ "29.self_attn.q_proj",
120
+ "31.self_attn.v_proj",
121
+ "text_model.layers.6.self_attn.q_proj",
122
+ "text_model.layers.0.self_attn.k_proj",
123
+ "27.self_attn.q_proj"
124
+ ],
125
+ "task_type": null,
126
+ "use_dora": false,
127
+ "use_rslora": false
128
+ }
lora_tuned/checkpoint-550/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a893e5198ce0c4cb0d40b2c3db9a13a464fb6e8d7b49e5070113f32af7891e07
3
+ size 2806433816
lora_tuned/checkpoint-550/generation_config.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 128000,
4
+ "eos_token_id": [
5
+ 128001,
6
+ 128008,
7
+ 128009
8
+ ],
9
+ "pad_token_id": 128002,
10
+ "transformers_version": "4.45.0.dev0"
11
+ }
lora_tuned/checkpoint-550/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:22a82660fa93c42531056d01fe9ac9515e3f2f8652a0763a443e57e92edcc2f1
3
+ size 358532508
lora_tuned/checkpoint-550/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2b2be11a9f88ececcd0ae9e1231c5a237f298073bebeb469193b8fb3c5a0390d
3
+ size 14244
lora_tuned/checkpoint-550/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:15d52c5f43e30771e906b4f1303f7dbf43eccd5c5be57a89b69a4de41941b5f9
3
+ size 1064
lora_tuned/checkpoint-550/trainer_state.json ADDED
@@ -0,0 +1,306 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.5594422817230225,
3
+ "best_model_checkpoint": "checkpoints_gradacc1/checkpoint-550",
4
+ "epoch": 3.3132530120481927,
5
+ "eval_steps": 50,
6
+ "global_step": 550,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.15060240963855423,
13
+ "grad_norm": 4.09214973449707,
14
+ "learning_rate": 5e-06,
15
+ "loss": 2.1029,
16
+ "step": 25
17
+ },
18
+ {
19
+ "epoch": 0.30120481927710846,
20
+ "grad_norm": 2.6950037479400635,
21
+ "learning_rate": 1e-05,
22
+ "loss": 1.554,
23
+ "step": 50
24
+ },
25
+ {
26
+ "epoch": 0.30120481927710846,
27
+ "eval_accuracy": 91.30434782608695,
28
+ "eval_loss": 1.2487136125564575,
29
+ "eval_no_valid_count_percentage": 8.695652173913043,
30
+ "eval_runtime": 36.6526,
31
+ "eval_samples_per_second": 0.628,
32
+ "eval_steps_per_second": 0.628,
33
+ "step": 50
34
+ },
35
+ {
36
+ "epoch": 0.45180722891566266,
37
+ "grad_norm": 1.2071181535720825,
38
+ "learning_rate": 9.844720496894411e-06,
39
+ "loss": 1.0399,
40
+ "step": 75
41
+ },
42
+ {
43
+ "epoch": 0.6024096385542169,
44
+ "grad_norm": 1.5161505937576294,
45
+ "learning_rate": 9.68944099378882e-06,
46
+ "loss": 0.9261,
47
+ "step": 100
48
+ },
49
+ {
50
+ "epoch": 0.6024096385542169,
51
+ "eval_accuracy": 100.0,
52
+ "eval_loss": 0.8182541131973267,
53
+ "eval_no_valid_count_percentage": 0.0,
54
+ "eval_runtime": 36.2518,
55
+ "eval_samples_per_second": 0.634,
56
+ "eval_steps_per_second": 0.634,
57
+ "step": 100
58
+ },
59
+ {
60
+ "epoch": 0.7530120481927711,
61
+ "grad_norm": 1.3708404302597046,
62
+ "learning_rate": 9.53416149068323e-06,
63
+ "loss": 0.9339,
64
+ "step": 125
65
+ },
66
+ {
67
+ "epoch": 0.9036144578313253,
68
+ "grad_norm": 2.2083890438079834,
69
+ "learning_rate": 9.37888198757764e-06,
70
+ "loss": 0.7509,
71
+ "step": 150
72
+ },
73
+ {
74
+ "epoch": 0.9036144578313253,
75
+ "eval_accuracy": 100.0,
76
+ "eval_loss": 0.7183033227920532,
77
+ "eval_no_valid_count_percentage": 0.0,
78
+ "eval_runtime": 36.3217,
79
+ "eval_samples_per_second": 0.633,
80
+ "eval_steps_per_second": 0.633,
81
+ "step": 150
82
+ },
83
+ {
84
+ "epoch": 1.0542168674698795,
85
+ "grad_norm": 1.8775794506072998,
86
+ "learning_rate": 9.22360248447205e-06,
87
+ "loss": 0.7564,
88
+ "step": 175
89
+ },
90
+ {
91
+ "epoch": 1.2048192771084336,
92
+ "grad_norm": 1.949776291847229,
93
+ "learning_rate": 9.068322981366461e-06,
94
+ "loss": 0.8138,
95
+ "step": 200
96
+ },
97
+ {
98
+ "epoch": 1.2048192771084336,
99
+ "eval_accuracy": 100.0,
100
+ "eval_loss": 0.6786131858825684,
101
+ "eval_no_valid_count_percentage": 0.0,
102
+ "eval_runtime": 36.5439,
103
+ "eval_samples_per_second": 0.629,
104
+ "eval_steps_per_second": 0.629,
105
+ "step": 200
106
+ },
107
+ {
108
+ "epoch": 1.355421686746988,
109
+ "grad_norm": 2.1677873134613037,
110
+ "learning_rate": 8.91304347826087e-06,
111
+ "loss": 0.671,
112
+ "step": 225
113
+ },
114
+ {
115
+ "epoch": 1.5060240963855422,
116
+ "grad_norm": 1.3840441703796387,
117
+ "learning_rate": 8.75776397515528e-06,
118
+ "loss": 0.767,
119
+ "step": 250
120
+ },
121
+ {
122
+ "epoch": 1.5060240963855422,
123
+ "eval_accuracy": 100.0,
124
+ "eval_loss": 0.6524815559387207,
125
+ "eval_no_valid_count_percentage": 0.0,
126
+ "eval_runtime": 36.2966,
127
+ "eval_samples_per_second": 0.634,
128
+ "eval_steps_per_second": 0.634,
129
+ "step": 250
130
+ },
131
+ {
132
+ "epoch": 1.6566265060240963,
133
+ "grad_norm": 2.0291168689727783,
134
+ "learning_rate": 8.60248447204969e-06,
135
+ "loss": 0.6195,
136
+ "step": 275
137
+ },
138
+ {
139
+ "epoch": 1.8072289156626506,
140
+ "grad_norm": 1.4875630140304565,
141
+ "learning_rate": 8.4472049689441e-06,
142
+ "loss": 0.6076,
143
+ "step": 300
144
+ },
145
+ {
146
+ "epoch": 1.8072289156626506,
147
+ "eval_accuracy": 100.0,
148
+ "eval_loss": 0.612907350063324,
149
+ "eval_no_valid_count_percentage": 0.0,
150
+ "eval_runtime": 36.4055,
151
+ "eval_samples_per_second": 0.632,
152
+ "eval_steps_per_second": 0.632,
153
+ "step": 300
154
+ },
155
+ {
156
+ "epoch": 1.9578313253012047,
157
+ "grad_norm": 1.90384042263031,
158
+ "learning_rate": 8.29192546583851e-06,
159
+ "loss": 0.5857,
160
+ "step": 325
161
+ },
162
+ {
163
+ "epoch": 2.108433734939759,
164
+ "grad_norm": 2.606180191040039,
165
+ "learning_rate": 8.13664596273292e-06,
166
+ "loss": 0.5763,
167
+ "step": 350
168
+ },
169
+ {
170
+ "epoch": 2.108433734939759,
171
+ "eval_accuracy": 100.0,
172
+ "eval_loss": 0.6015170812606812,
173
+ "eval_no_valid_count_percentage": 0.0,
174
+ "eval_runtime": 36.3853,
175
+ "eval_samples_per_second": 0.632,
176
+ "eval_steps_per_second": 0.632,
177
+ "step": 350
178
+ },
179
+ {
180
+ "epoch": 2.2590361445783134,
181
+ "grad_norm": 2.140429735183716,
182
+ "learning_rate": 7.98136645962733e-06,
183
+ "loss": 0.4908,
184
+ "step": 375
185
+ },
186
+ {
187
+ "epoch": 2.4096385542168672,
188
+ "grad_norm": 3.0130579471588135,
189
+ "learning_rate": 7.82608695652174e-06,
190
+ "loss": 0.4804,
191
+ "step": 400
192
+ },
193
+ {
194
+ "epoch": 2.4096385542168672,
195
+ "eval_accuracy": 100.0,
196
+ "eval_loss": 0.6012862920761108,
197
+ "eval_no_valid_count_percentage": 0.0,
198
+ "eval_runtime": 36.3165,
199
+ "eval_samples_per_second": 0.633,
200
+ "eval_steps_per_second": 0.633,
201
+ "step": 400
202
+ },
203
+ {
204
+ "epoch": 2.5602409638554215,
205
+ "grad_norm": 2.587240695953369,
206
+ "learning_rate": 7.670807453416149e-06,
207
+ "loss": 0.4906,
208
+ "step": 425
209
+ },
210
+ {
211
+ "epoch": 2.710843373493976,
212
+ "grad_norm": 2.6261508464813232,
213
+ "learning_rate": 7.515527950310559e-06,
214
+ "loss": 0.5931,
215
+ "step": 450
216
+ },
217
+ {
218
+ "epoch": 2.710843373493976,
219
+ "eval_accuracy": 100.0,
220
+ "eval_loss": 0.5865727066993713,
221
+ "eval_no_valid_count_percentage": 0.0,
222
+ "eval_runtime": 36.3197,
223
+ "eval_samples_per_second": 0.633,
224
+ "eval_steps_per_second": 0.633,
225
+ "step": 450
226
+ },
227
+ {
228
+ "epoch": 2.86144578313253,
229
+ "grad_norm": 2.947445869445801,
230
+ "learning_rate": 7.36024844720497e-06,
231
+ "loss": 0.5575,
232
+ "step": 475
233
+ },
234
+ {
235
+ "epoch": 3.0120481927710845,
236
+ "grad_norm": 1.6870458126068115,
237
+ "learning_rate": 7.2049689440993795e-06,
238
+ "loss": 0.5763,
239
+ "step": 500
240
+ },
241
+ {
242
+ "epoch": 3.0120481927710845,
243
+ "eval_accuracy": 100.0,
244
+ "eval_loss": 0.5629354119300842,
245
+ "eval_no_valid_count_percentage": 0.0,
246
+ "eval_runtime": 36.3522,
247
+ "eval_samples_per_second": 0.633,
248
+ "eval_steps_per_second": 0.633,
249
+ "step": 500
250
+ },
251
+ {
252
+ "epoch": 3.1626506024096384,
253
+ "grad_norm": 4.26260232925415,
254
+ "learning_rate": 7.04968944099379e-06,
255
+ "loss": 0.4701,
256
+ "step": 525
257
+ },
258
+ {
259
+ "epoch": 3.3132530120481927,
260
+ "grad_norm": 2.8531856536865234,
261
+ "learning_rate": 6.894409937888199e-06,
262
+ "loss": 0.4484,
263
+ "step": 550
264
+ },
265
+ {
266
+ "epoch": 3.3132530120481927,
267
+ "eval_accuracy": 100.0,
268
+ "eval_loss": 0.5594422817230225,
269
+ "eval_no_valid_count_percentage": 0.0,
270
+ "eval_runtime": 36.3228,
271
+ "eval_samples_per_second": 0.633,
272
+ "eval_steps_per_second": 0.633,
273
+ "step": 550
274
+ }
275
+ ],
276
+ "logging_steps": 25,
277
+ "max_steps": 1660,
278
+ "num_input_tokens_seen": 0,
279
+ "num_train_epochs": 10,
280
+ "save_steps": 50,
281
+ "stateful_callbacks": {
282
+ "EarlyStoppingCallback": {
283
+ "args": {
284
+ "early_stopping_patience": 3,
285
+ "early_stopping_threshold": 0.001
286
+ },
287
+ "attributes": {
288
+ "early_stopping_patience_counter": 0
289
+ }
290
+ },
291
+ "TrainerControl": {
292
+ "args": {
293
+ "should_epoch_stop": false,
294
+ "should_evaluate": false,
295
+ "should_log": false,
296
+ "should_save": true,
297
+ "should_training_stop": false
298
+ },
299
+ "attributes": {}
300
+ }
301
+ },
302
+ "total_flos": 1.3664779779290515e+17,
303
+ "train_batch_size": 1,
304
+ "trial_name": null,
305
+ "trial_params": null
306
+ }
lora_tuned/checkpoint-550/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fad3251001be4ae1e880dd371b8d8607c21ba2e6f7c08da23ed8f2f907094ad1
3
+ size 5304