Navneetkumar11 committed on
Commit
9341b57
·
verified ·
1 Parent(s): 8ba2b6c

Training in progress, epoch 1

Browse files
README.md CHANGED
@@ -30,14 +30,15 @@ print(output["generated_text"])
30
 
31
 
32
 
 
33
  This model was trained with SFT.
34
 
35
  ### Framework versions
36
 
37
- - TRL: 0.23.1
38
  - Transformers: 5.5.0
39
- - Pytorch: 2.11.0
40
- - Datasets: 4.0.0
41
  - Tokenizers: 0.22.2
42
 
43
  ## Citations
@@ -47,12 +48,11 @@ This model was trained with SFT.
47
  Cite TRL as:
48
 
49
  ```bibtex
50
- @misc{vonwerra2022trl,
51
- title = {{TRL: Transformer Reinforcement Learning}},
52
- author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallou{\'e}dec},
53
- year = 2020,
54
- journal = {GitHub repository},
55
- publisher = {GitHub},
56
- howpublished = {\url{https://github.com/huggingface/trl}}
57
  }
58
  ```
 
30
 
31
 
32
 
33
+
34
  This model was trained with SFT.
35
 
36
  ### Framework versions
37
 
38
+ - TRL: 1.0.0
39
  - Transformers: 5.5.0
40
+ - Pytorch: 2.10.0+cu128
41
+ - Datasets: 4.8.4
42
  - Tokenizers: 0.22.2
43
 
44
  ## Citations
 
48
  Cite TRL as:
49
 
50
  ```bibtex
51
+ @software{vonwerra2020trl,
52
+ title = {{TRL: Transformers Reinforcement Learning}},
53
+ author = {von Werra, Leandro and Belkada, Younes and Tunstall, Lewis and Beeching, Edward and Thrush, Tristan and Lambert, Nathan and Huang, Shengyi and Rasul, Kashif and Gallouédec, Quentin},
54
+ license = {Apache-2.0},
55
+ url = {https://github.com/huggingface/trl},
56
+ year = {2020}
 
57
  }
58
  ```
adapter_config.json CHANGED
@@ -32,260 +32,260 @@
32
  "rank_pattern": {},
33
  "revision": null,
34
  "target_modules": [
35
- "language_model.layers.12.mlp.down_proj",
36
- "34.mlp.gate_proj",
37
- "31.mlp.up_proj",
38
- "27.self_attn.k_proj",
39
- "language_model.layers.12.self_attn.v_proj",
40
- "language_model.layers.14.self_attn.q_proj",
41
- "language_model.layers.1.self_attn.k_proj",
42
- "23.self_attn.q_proj",
43
- "24.self_attn.v_proj",
44
- "21.self_attn.q_proj",
45
  "language_model.layers.8.mlp.gate_proj",
46
- "language_model.layers.15.mlp.gate_proj",
47
- "33.self_attn.q_proj",
48
- "language_model.layers.10.mlp.gate_proj",
49
- "17.mlp.down_proj",
50
- "18.self_attn.q_proj",
51
- "16.mlp.up_proj",
52
- "20.self_attn.q_proj",
53
- "33.mlp.gate_proj",
54
- "23.mlp.gate_proj",
55
- "22.self_attn.k_proj",
56
- "language_model.layers.3.self_attn.v_proj",
57
- "18.mlp.down_proj",
58
- "language_model.layers.0.self_attn.k_proj",
59
- "24.self_attn.o_proj",
60
- "linear",
61
- "29.self_attn.q_proj",
62
- "17.self_attn.v_proj",
63
  "language_model.layers.2.mlp.gate_proj",
64
- "30.self_attn.k_proj",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
  "20.self_attn.o_proj",
66
- "language_model.layers.6.self_attn.q_proj",
 
67
  "25.self_attn.o_proj",
68
- "language_model.layers.7.self_attn.v_proj",
69
- "language_model.layers.6.self_attn.k_proj",
70
- "22.mlp.up_proj",
71
- "language_model.layers.11.self_attn.k_proj",
 
 
 
 
 
72
  "18.self_attn.o_proj",
73
- "language_model.layers.4.mlp.down_proj",
74
- "26.mlp.down_proj",
75
- "19.self_attn.k_proj",
76
- "per_layer_model_projection",
77
- "34.self_attn.k_proj",
78
- "21.self_attn.o_proj",
79
- "language_model.layers.5.self_attn.v_proj",
80
- "25.self_attn.v_proj",
81
- "16.self_attn.k_proj",
82
- "19.mlp.gate_proj",
83
- "language_model.layers.10.self_attn.v_proj",
84
- "language_model.layers.9.self_attn.q_proj",
85
- "language_model.layers.15.self_attn.o_proj",
86
  "language_model.layers.8.self_attn.k_proj",
87
- "28.self_attn.o_proj",
88
- "language_model.layers.9.self_attn.v_proj",
89
- "language_model.layers.1.self_attn.o_proj",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
  "27.self_attn.q_proj",
91
- "24.self_attn.k_proj",
 
 
92
  "relative_k_proj",
93
- "language_model.layers.11.mlp.up_proj",
94
- "19.self_attn.v_proj",
95
- "language_model.layers.6.self_attn.o_proj",
96
- "language_model.layers.2.self_attn.q_proj",
97
- "32.self_attn.o_proj",
98
- "19.mlp.down_proj",
99
- "30.self_attn.o_proj",
100
- "language_model.layers.15.self_attn.v_proj",
101
- "26.self_attn.v_proj",
102
- "embedding_projection",
103
- "31.mlp.gate_proj",
104
- "language_model.layers.4.self_attn.o_proj",
105
- "21.mlp.up_proj",
106
- "27.mlp.up_proj",
107
- "26.mlp.gate_proj",
108
- "input_proj_linear",
109
- "18.self_attn.v_proj",
110
- "28.mlp.gate_proj",
111
- "29.self_attn.k_proj",
112
  "language_model.layers.11.mlp.gate_proj",
113
- "language_model.layers.9.self_attn.o_proj",
114
- "16.mlp.down_proj",
115
- "34.mlp.down_proj",
116
- "language_model.layers.0.self_attn.o_proj",
117
- "language_model.layers.13.self_attn.o_proj",
118
- "per_layer_input_gate",
119
- "16.self_attn.v_proj",
120
- "22.self_attn.v_proj",
121
- "23.mlp.up_proj",
122
- "30.self_attn.q_proj",
123
  "23.mlp.down_proj",
124
- "31.self_attn.v_proj",
125
- "18.self_attn.k_proj",
126
- "28.mlp.down_proj",
 
 
 
 
 
 
 
 
 
127
  "language_model.layers.10.mlp.down_proj",
128
- "33.self_attn.k_proj",
129
- "language_model.layers.7.self_attn.q_proj",
130
- "28.self_attn.q_proj",
131
  "language_model.layers.12.mlp.gate_proj",
132
- "29.mlp.up_proj",
133
- "language_model.layers.3.mlp.gate_proj",
134
  "30.mlp.up_proj",
135
- "language_model.layers.8.mlp.up_proj",
136
- "language_model.layers.13.self_attn.q_proj",
137
- "20.self_attn.v_proj",
138
- "language_model.layers.4.self_attn.k_proj",
139
- "26.mlp.up_proj",
140
- "34.self_attn.o_proj",
141
- "language_model.layers.13.mlp.down_proj",
142
- "19.self_attn.o_proj",
143
- "language_model.layers.8.mlp.down_proj",
144
- "21.mlp.down_proj",
145
- "language_model.layers.15.self_attn.k_proj",
146
- "language_model.layers.4.self_attn.v_proj",
147
- "language_model.layers.15.mlp.up_proj",
148
- "language_model.layers.3.self_attn.q_proj",
149
- "language_model.layers.2.mlp.up_proj",
150
- "28.self_attn.k_proj",
151
- "language_model.layers.3.self_attn.k_proj",
152
- "language_model.layers.10.self_attn.k_proj",
153
- "29.mlp.gate_proj",
154
- "language_model.layers.1.mlp.down_proj",
155
  "31.mlp.down_proj",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
156
  "language_model.layers.14.self_attn.o_proj",
 
 
157
  "22.self_attn.o_proj",
 
 
 
 
 
 
 
158
  "language_model.layers.5.self_attn.o_proj",
159
- "language_model.layers.2.self_attn.v_proj",
160
- "27.self_attn.v_proj",
161
- "27.mlp.down_proj",
162
- "language_model.layers.7.mlp.down_proj",
163
- "language_model.layers.13.mlp.gate_proj",
164
- "language_model.layers.11.mlp.down_proj",
165
- "27.self_attn.o_proj",
166
- "language_model.layers.1.self_attn.q_proj",
167
- "output_proj",
168
- "language_model.layers.0.mlp.up_proj",
169
- "33.mlp.down_proj",
170
- "24.self_attn.q_proj",
171
- "18.mlp.up_proj",
172
  "17.self_attn.q_proj",
173
- "33.mlp.up_proj",
174
- "language_model.layers.13.self_attn.v_proj",
175
- "language_model.layers.10.self_attn.q_proj",
176
- "17.mlp.up_proj",
177
- "language_model.layers.5.self_attn.k_proj",
178
- "30.mlp.down_proj",
179
- "26.self_attn.k_proj",
180
- "language_model.layers.5.mlp.up_proj",
181
- "language_model.layers.13.self_attn.k_proj",
182
- "language_model.layers.11.self_attn.v_proj",
183
  "26.self_attn.o_proj",
184
- "21.mlp.gate_proj",
185
- "31.self_attn.k_proj",
186
- "language_model.layers.5.mlp.down_proj",
187
- "language_model.layers.6.mlp.up_proj",
188
  "language_model.layers.6.self_attn.v_proj",
189
- "22.mlp.down_proj",
190
- "language_model.layers.3.mlp.down_proj",
191
- "22.mlp.gate_proj",
192
- "24.mlp.gate_proj",
193
- "language_model.layers.14.mlp.up_proj",
194
- "language_model.layers.3.mlp.up_proj",
195
- "language_model.layers.0.self_attn.v_proj",
196
- "language_model.layers.6.mlp.gate_proj",
197
- "19.self_attn.q_proj",
198
- "19.mlp.up_proj",
199
- "language_model.layers.0.self_attn.q_proj",
200
- "24.mlp.down_proj",
201
- "25.self_attn.q_proj",
202
- "language_model.layers.8.self_attn.v_proj",
203
- "17.self_attn.o_proj",
204
- "20.mlp.up_proj",
205
- "32.mlp.up_proj",
206
- "16.mlp.gate_proj",
207
- "31.self_attn.o_proj",
208
- "25.mlp.down_proj",
209
- "language_model.layers.6.mlp.down_proj",
210
- "language_model.layers.10.self_attn.o_proj",
211
- "20.mlp.gate_proj",
212
- "25.mlp.up_proj",
213
- "33.self_attn.v_proj",
214
- "25.mlp.gate_proj",
215
- "language_model.layers.2.mlp.down_proj",
216
- "language_model.layers.14.mlp.gate_proj",
217
  "language_model.layers.8.self_attn.o_proj",
218
- "26.self_attn.q_proj",
 
 
 
 
 
 
 
 
 
219
  "32.self_attn.v_proj",
220
- "20.mlp.down_proj",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
221
  "language_model.layers.5.mlp.gate_proj",
222
- "21.self_attn.k_proj",
 
 
 
 
223
  "language_model.layers.11.self_attn.o_proj",
224
- "language_model.layers.5.self_attn.q_proj",
225
- "18.mlp.gate_proj",
226
- "17.mlp.gate_proj",
227
- "22.self_attn.q_proj",
228
- "23.self_attn.v_proj",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
229
  "per_layer_projection",
 
 
 
230
  "24.mlp.up_proj",
231
- "30.self_attn.v_proj",
232
- "language_model.layers.7.mlp.gate_proj",
233
- "language_model.layers.2.self_attn.k_proj",
234
- "32.self_attn.q_proj",
235
- "29.self_attn.o_proj",
236
- "21.self_attn.v_proj",
237
- "language_model.layers.9.mlp.up_proj",
238
- "language_model.layers.9.mlp.gate_proj",
239
  "language_model.layers.0.mlp.gate_proj",
240
- "language_model.layers.12.self_attn.k_proj",
241
- "29.self_attn.v_proj",
242
- "language_model.layers.0.mlp.down_proj",
243
- "language_model.layers.10.mlp.up_proj",
244
- "language_model.layers.14.self_attn.k_proj",
245
- "32.self_attn.k_proj",
246
- "language_model.layers.12.self_attn.o_proj",
247
- "31.self_attn.q_proj",
248
- "language_model.layers.4.mlp.gate_proj",
249
- "language_model.layers.15.self_attn.q_proj",
250
- "language_model.layers.1.mlp.up_proj",
251
- "language_model.layers.4.mlp.up_proj",
252
- "33.self_attn.o_proj",
253
- "23.self_attn.o_proj",
254
- "34.self_attn.q_proj",
255
- "language_model.layers.11.self_attn.q_proj",
256
- "34.mlp.up_proj",
257
- "language_model.layers.7.mlp.up_proj",
258
- "language_model.layers.1.self_attn.v_proj",
259
  "language_model.layers.2.self_attn.o_proj",
260
- "language_model.layers.15.mlp.down_proj",
261
- "30.mlp.gate_proj",
262
- "language_model.layers.14.self_attn.v_proj",
263
- "29.mlp.down_proj",
264
- "language_model.layers.1.mlp.gate_proj",
 
 
 
 
265
  "language_model.layers.3.self_attn.o_proj",
266
- "language_model.layers.12.self_attn.q_proj",
267
- "language_model.layers.7.self_attn.o_proj",
268
- "28.mlp.up_proj",
269
- "language_model.layers.8.self_attn.q_proj",
270
- "32.mlp.gate_proj",
271
- "20.self_attn.k_proj",
272
- "16.self_attn.q_proj",
273
- "25.self_attn.k_proj",
274
- "28.self_attn.v_proj",
275
- "32.mlp.down_proj",
276
- "language_model.layers.7.self_attn.k_proj",
277
- "language_model.layers.9.self_attn.k_proj",
278
- "27.mlp.gate_proj",
279
- "input_proj",
280
- "language_model.layers.13.mlp.up_proj",
281
- "language_model.layers.12.mlp.up_proj",
282
- "16.self_attn.o_proj",
283
  "language_model.layers.9.mlp.down_proj",
284
- "34.self_attn.v_proj",
285
- "language_model.layers.14.mlp.down_proj",
286
- "23.self_attn.k_proj",
287
- "17.self_attn.k_proj",
288
- "language_model.layers.4.self_attn.q_proj"
289
  ],
290
  "target_parameters": null,
291
  "task_type": "CAUSAL_LM",
 
32
  "rank_pattern": {},
33
  "revision": null,
34
  "target_modules": [
 
 
 
 
 
 
 
 
 
 
35
  "language_model.layers.8.mlp.gate_proj",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
  "language_model.layers.2.mlp.gate_proj",
37
+ "23.self_attn.k_proj",
38
+ "language_model.layers.15.self_attn.o_proj",
39
+ "34.mlp.up_proj",
40
+ "20.self_attn.v_proj",
41
+ "20.mlp.up_proj",
42
+ "32.mlp.down_proj",
43
+ "34.mlp.down_proj",
44
+ "language_model.layers.7.mlp.gate_proj",
45
+ "26.mlp.gate_proj",
46
+ "per_layer_input_gate",
47
+ "language_model.layers.1.self_attn.k_proj",
48
+ "language_model.layers.2.self_attn.v_proj",
49
+ "21.mlp.down_proj",
50
+ "language_model.layers.2.self_attn.k_proj",
51
+ "34.self_attn.q_proj",
52
+ "language_model.layers.3.mlp.up_proj",
53
+ "input_proj_linear",
54
+ "34.mlp.gate_proj",
55
+ "32.mlp.gate_proj",
56
+ "17.self_attn.o_proj",
57
+ "language_model.layers.14.mlp.up_proj",
58
+ "language_model.layers.7.self_attn.o_proj",
59
+ "language_model.layers.4.self_attn.q_proj",
60
+ "language_model.layers.3.self_attn.q_proj",
61
+ "language_model.layers.13.self_attn.v_proj",
62
+ "language_model.layers.11.mlp.down_proj",
63
+ "language_model.layers.13.mlp.down_proj",
64
+ "24.self_attn.q_proj",
65
+ "language_model.layers.12.self_attn.q_proj",
66
+ "22.mlp.up_proj",
67
+ "language_model.layers.8.mlp.down_proj",
68
+ "language_model.layers.3.self_attn.v_proj",
69
+ "language_model.layers.5.mlp.down_proj",
70
+ "language_model.layers.1.self_attn.v_proj",
71
  "20.self_attn.o_proj",
72
+ "language_model.layers.1.mlp.down_proj",
73
+ "22.mlp.gate_proj",
74
  "25.self_attn.o_proj",
75
+ "33.self_attn.k_proj",
76
+ "language_model.layers.10.self_attn.q_proj",
77
+ "33.self_attn.v_proj",
78
+ "17.mlp.down_proj",
79
+ "language_model.layers.4.self_attn.k_proj",
80
+ "input_proj",
81
+ "linear",
82
+ "language_model.layers.15.mlp.up_proj",
83
+ "16.mlp.up_proj",
84
  "18.self_attn.o_proj",
85
+ "33.mlp.down_proj",
86
+ "20.mlp.gate_proj",
 
 
 
 
 
 
 
 
 
 
 
87
  "language_model.layers.8.self_attn.k_proj",
88
+ "language_model.layers.5.self_attn.q_proj",
89
+ "language_model.layers.9.mlp.gate_proj",
90
+ "25.self_attn.q_proj",
91
+ "27.mlp.gate_proj",
92
+ "language_model.layers.12.self_attn.k_proj",
93
+ "17.self_attn.v_proj",
94
+ "language_model.layers.13.self_attn.q_proj",
95
+ "26.self_attn.v_proj",
96
+ "language_model.layers.3.self_attn.k_proj",
97
+ "32.self_attn.k_proj",
98
+ "language_model.layers.0.mlp.down_proj",
99
+ "24.mlp.down_proj",
100
+ "language_model.layers.15.self_attn.q_proj",
101
+ "language_model.layers.5.self_attn.k_proj",
102
+ "language_model.layers.1.self_attn.q_proj",
103
+ "19.self_attn.v_proj",
104
+ "26.mlp.down_proj",
105
  "27.self_attn.q_proj",
106
+ "language_model.layers.12.self_attn.v_proj",
107
+ "27.self_attn.v_proj",
108
+ "language_model.layers.11.self_attn.v_proj",
109
  "relative_k_proj",
110
+ "language_model.layers.6.mlp.down_proj",
111
+ "21.mlp.gate_proj",
112
+ "language_model.layers.4.mlp.gate_proj",
113
+ "23.self_attn.v_proj",
114
+ "language_model.layers.0.mlp.up_proj",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
  "language_model.layers.11.mlp.gate_proj",
116
+ "language_model.layers.11.self_attn.k_proj",
117
+ "21.mlp.up_proj",
118
+ "24.self_attn.o_proj",
119
+ "language_model.layers.15.mlp.down_proj",
 
 
 
 
 
 
120
  "23.mlp.down_proj",
121
+ "language_model.layers.4.self_attn.v_proj",
122
+ "24.self_attn.k_proj",
123
+ "24.mlp.gate_proj",
124
+ "27.mlp.down_proj",
125
+ "19.self_attn.q_proj",
126
+ "28.self_attn.o_proj",
127
+ "language_model.layers.7.mlp.up_proj",
128
+ "19.mlp.down_proj",
129
+ "30.self_attn.o_proj",
130
+ "language_model.layers.6.self_attn.o_proj",
131
+ "language_model.layers.1.mlp.up_proj",
132
+ "30.mlp.gate_proj",
133
  "language_model.layers.10.mlp.down_proj",
134
+ "22.mlp.down_proj",
 
 
135
  "language_model.layers.12.mlp.gate_proj",
136
+ "17.self_attn.k_proj",
 
137
  "30.mlp.up_proj",
138
+ "language_model.layers.10.self_attn.v_proj",
139
+ "22.self_attn.k_proj",
140
+ "30.self_attn.q_proj",
141
+ "language_model.layers.8.self_attn.q_proj",
142
+ "language_model.layers.15.mlp.gate_proj",
143
+ "language_model.layers.9.self_attn.q_proj",
144
+ "17.mlp.up_proj",
145
+ "language_model.layers.4.mlp.down_proj",
146
+ "language_model.layers.9.self_attn.k_proj",
147
+ "29.self_attn.q_proj",
148
+ "language_model.layers.15.self_attn.v_proj",
 
 
 
 
 
 
 
 
 
149
  "31.mlp.down_proj",
150
+ "17.mlp.gate_proj",
151
+ "33.mlp.gate_proj",
152
+ "language_model.layers.3.mlp.down_proj",
153
+ "language_model.layers.1.mlp.gate_proj",
154
+ "16.self_attn.q_proj",
155
+ "22.self_attn.q_proj",
156
+ "34.self_attn.v_proj",
157
+ "26.self_attn.k_proj",
158
+ "21.self_attn.q_proj",
159
+ "language_model.layers.13.mlp.up_proj",
160
+ "language_model.layers.8.self_attn.v_proj",
161
+ "18.self_attn.k_proj",
162
+ "31.self_attn.q_proj",
163
+ "32.mlp.up_proj",
164
  "language_model.layers.14.self_attn.o_proj",
165
+ "20.mlp.down_proj",
166
+ "16.mlp.gate_proj",
167
  "22.self_attn.o_proj",
168
+ "26.mlp.up_proj",
169
+ "language_model.layers.0.self_attn.v_proj",
170
+ "27.self_attn.k_proj",
171
+ "language_model.layers.14.self_attn.q_proj",
172
+ "29.mlp.down_proj",
173
+ "embedding_projection",
174
+ "24.self_attn.v_proj",
175
  "language_model.layers.5.self_attn.o_proj",
176
+ "language_model.layers.13.self_attn.o_proj",
177
+ "16.self_attn.o_proj",
 
 
 
 
 
 
 
 
 
 
 
178
  "17.self_attn.q_proj",
179
+ "language_model.layers.10.mlp.up_proj",
180
+ "28.mlp.up_proj",
181
+ "20.self_attn.q_proj",
182
+ "16.self_attn.k_proj",
183
+ "23.self_attn.q_proj",
 
 
 
 
 
184
  "26.self_attn.o_proj",
185
+ "26.self_attn.q_proj",
 
 
 
186
  "language_model.layers.6.self_attn.v_proj",
187
+ "18.mlp.gate_proj",
188
+ "23.self_attn.o_proj",
189
+ "language_model.layers.7.self_attn.v_proj",
190
+ "30.mlp.down_proj",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
191
  "language_model.layers.8.self_attn.o_proj",
192
+ "output_proj",
193
+ "29.self_attn.k_proj",
194
+ "language_model.layers.2.self_attn.q_proj",
195
+ "32.self_attn.q_proj",
196
+ "33.mlp.up_proj",
197
+ "20.self_attn.k_proj",
198
+ "language_model.layers.10.mlp.gate_proj",
199
+ "18.mlp.up_proj",
200
+ "language_model.layers.12.self_attn.o_proj",
201
+ "18.self_attn.v_proj",
202
  "32.self_attn.v_proj",
203
+ "29.self_attn.o_proj",
204
+ "language_model.layers.12.mlp.down_proj",
205
+ "language_model.layers.7.mlp.down_proj",
206
+ "language_model.layers.6.self_attn.q_proj",
207
+ "25.mlp.up_proj",
208
+ "language_model.layers.0.self_attn.o_proj",
209
+ "22.self_attn.v_proj",
210
+ "28.mlp.down_proj",
211
+ "30.self_attn.k_proj",
212
+ "34.self_attn.k_proj",
213
+ "32.self_attn.o_proj",
214
+ "33.self_attn.o_proj",
215
+ "29.self_attn.v_proj",
216
+ "27.mlp.up_proj",
217
+ "18.mlp.down_proj",
218
+ "language_model.layers.12.mlp.up_proj",
219
+ "27.self_attn.o_proj",
220
+ "language_model.layers.13.self_attn.k_proj",
221
+ "28.self_attn.k_proj",
222
  "language_model.layers.5.mlp.gate_proj",
223
+ "19.mlp.gate_proj",
224
+ "language_model.layers.15.self_attn.k_proj",
225
+ "25.self_attn.k_proj",
226
+ "language_model.layers.0.self_attn.k_proj",
227
+ "language_model.layers.10.self_attn.k_proj",
228
  "language_model.layers.11.self_attn.o_proj",
229
+ "31.mlp.up_proj",
230
+ "language_model.layers.14.self_attn.v_proj",
231
+ "language_model.layers.6.mlp.up_proj",
232
+ "25.mlp.down_proj",
233
+ "25.mlp.gate_proj",
234
+ "21.self_attn.o_proj",
235
+ "language_model.layers.5.self_attn.v_proj",
236
+ "language_model.layers.9.self_attn.v_proj",
237
+ "language_model.layers.2.mlp.up_proj",
238
+ "language_model.layers.6.mlp.gate_proj",
239
+ "29.mlp.gate_proj",
240
+ "language_model.layers.8.mlp.up_proj",
241
+ "language_model.layers.9.self_attn.o_proj",
242
+ "28.self_attn.q_proj",
243
+ "29.mlp.up_proj",
244
+ "25.self_attn.v_proj",
245
+ "31.self_attn.k_proj",
246
+ "language_model.layers.6.self_attn.k_proj",
247
+ "21.self_attn.v_proj",
248
+ "language_model.layers.11.self_attn.q_proj",
249
+ "language_model.layers.13.mlp.gate_proj",
250
+ "language_model.layers.4.self_attn.o_proj",
251
+ "language_model.layers.4.mlp.up_proj",
252
+ "28.self_attn.v_proj",
253
+ "31.self_attn.o_proj",
254
+ "31.mlp.gate_proj",
255
  "per_layer_projection",
256
+ "language_model.layers.7.self_attn.k_proj",
257
+ "language_model.layers.1.self_attn.o_proj",
258
+ "language_model.layers.7.self_attn.q_proj",
259
  "24.mlp.up_proj",
260
+ "19.self_attn.k_proj",
261
+ "language_model.layers.11.mlp.up_proj",
262
+ "language_model.layers.5.mlp.up_proj",
263
+ "16.self_attn.v_proj",
 
 
 
 
264
  "language_model.layers.0.mlp.gate_proj",
265
+ "23.mlp.gate_proj",
266
+ "language_model.layers.10.self_attn.o_proj",
267
+ "21.self_attn.k_proj",
268
+ "19.self_attn.o_proj",
269
+ "28.mlp.gate_proj",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
270
  "language_model.layers.2.self_attn.o_proj",
271
+ "19.mlp.up_proj",
272
+ "language_model.layers.14.mlp.gate_proj",
273
+ "23.mlp.up_proj",
274
+ "30.self_attn.v_proj",
275
+ "language_model.layers.2.mlp.down_proj",
276
+ "language_model.layers.14.mlp.down_proj",
277
+ "per_layer_model_projection",
278
+ "18.self_attn.q_proj",
279
+ "language_model.layers.9.mlp.up_proj",
280
  "language_model.layers.3.self_attn.o_proj",
281
+ "31.self_attn.v_proj",
282
+ "language_model.layers.14.self_attn.k_proj",
283
+ "16.mlp.down_proj",
284
+ "33.self_attn.q_proj",
285
+ "language_model.layers.0.self_attn.q_proj",
286
+ "34.self_attn.o_proj",
 
 
 
 
 
 
 
 
 
 
 
287
  "language_model.layers.9.mlp.down_proj",
288
+ "language_model.layers.3.mlp.gate_proj"
 
 
 
 
289
  ],
290
  "target_parameters": null,
291
  "task_type": "CAUSAL_LM",
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3edb94b4b84823f252695d916ad2b80ef6e30f61306ae75aaee5fca5b7937cad
3
- size 1767188464
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bf22d900596a6eb57e4335b7294cfd8cc64069fe700b352c8efa60003b543e57
3
+ size 1688992024
runs/Apr06_04-45-01_d154214b03a0/events.out.tfevents.1775450701.d154214b03a0.3470.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:acba9c2c9701a5664b938836f0633d4e865766f1072754040b599503a917b94c
3
+ size 13512
runs/Apr06_04-57-43_d154214b03a0/events.out.tfevents.1775451463.d154214b03a0.6207.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b116e3d5378e2705dfc340c483b9b266d9949ed0f311df72bae5e7a221d48e74
3
+ size 13943
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:17013debad8b537040d1b508a13691b08d40131f28cbf0e912832edb73108fdf
3
- size 5649
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6da60ec6db46a4d38e71b9a93c00a4827ea5baba1db83002f574e89ca570fb11
3
+ size 5713