u04ob20 committed on
Commit
82442bd
·
1 Parent(s): c22333b

model files

Browse files
Files changed (30) hide show
  1. google-gemma-2b-it/checkpoint-1083/config.json → config.json +0 -0
  2. google-gemma-2b-it/checkpoint-1083/generation_config.json → generation_config.json +0 -0
  3. google-gemma-2b-it/checkpoint-1083/model-00001-of-00003.safetensors +0 -3
  4. google-gemma-2b-it/checkpoint-1083/model-00002-of-00003.safetensors +0 -3
  5. google-gemma-2b-it/checkpoint-1083/model-00003-of-00003.safetensors +0 -3
  6. google-gemma-2b-it/checkpoint-1083/optimizer.pt +0 -3
  7. google-gemma-2b-it/checkpoint-1083/rng_state.pth +0 -3
  8. google-gemma-2b-it/checkpoint-1083/scheduler.pt +0 -3
  9. google-gemma-2b-it/checkpoint-1083/trainer_state.json +0 -306
  10. google-gemma-2b-it/checkpoint-114/config.json +0 -29
  11. google-gemma-2b-it/checkpoint-114/generation_config.json +0 -7
  12. google-gemma-2b-it/checkpoint-114/model-00001-of-00003.safetensors +0 -3
  13. google-gemma-2b-it/checkpoint-114/model-00002-of-00003.safetensors +0 -3
  14. google-gemma-2b-it/checkpoint-114/model-00003-of-00003.safetensors +0 -3
  15. google-gemma-2b-it/checkpoint-114/model.safetensors.index.json +0 -171
  16. google-gemma-2b-it/checkpoint-114/optimizer.pt +0 -3
  17. google-gemma-2b-it/checkpoint-114/rng_state.pth +0 -3
  18. google-gemma-2b-it/checkpoint-114/scheduler.pt +0 -3
  19. google-gemma-2b-it/checkpoint-114/trainer_state.json +0 -51
  20. google-gemma-2b-it/checkpoint-114/training_args.bin +0 -3
  21. google-gemma-2b-it/config.json +0 -29
  22. google-gemma-2b-it/generation_config.json +0 -7
  23. google-gemma-2b-it/model.safetensors.index.json +0 -171
  24. google-gemma-2b-it/training_args.bin +0 -3
  25. google-gemma-2b-it/model-00001-of-00003.safetensors → model-00001-of-00003.safetensors +0 -0
  26. google-gemma-2b-it/model-00002-of-00003.safetensors → model-00002-of-00003.safetensors +0 -0
  27. google-gemma-2b-it/model-00003-of-00003.safetensors → model-00003-of-00003.safetensors +0 -0
  28. google-gemma-2b-it/checkpoint-1083/model.safetensors.index.json → model.safetensors.index.json +0 -0
  29. google-gemma-2b-it/trainer_state.json → trainer_state.json +0 -0
  30. google-gemma-2b-it/checkpoint-1083/training_args.bin → training_args.bin +0 -0
google-gemma-2b-it/checkpoint-1083/config.json → config.json RENAMED
File without changes
google-gemma-2b-it/checkpoint-1083/generation_config.json → generation_config.json RENAMED
File without changes
google-gemma-2b-it/checkpoint-1083/model-00001-of-00003.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:e928c3436befbeb60f11abfa38cc7c4e06dff86be0b2b5cd8c2be6d616b9f266
3
- size 4911635192
 
 
 
 
google-gemma-2b-it/checkpoint-1083/model-00002-of-00003.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:87261d1c9af2516c48938cf057abcffe74222d9bc7e28a9fdeb08167ecf87b52
3
- size 4978830584
 
 
 
 
google-gemma-2b-it/checkpoint-1083/model-00003-of-00003.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:2fd4d7255f50cb5ce7c863a0a0de43f2400af9ffdddad415971338951baf25e0
3
- size 134242760
 
 
 
 
google-gemma-2b-it/checkpoint-1083/optimizer.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:8cfd8a6ff376aa33aee510e36ac3845da45d2f0382fe419028c8338af5b667b3
3
- size 20049522477
 
 
 
 
google-gemma-2b-it/checkpoint-1083/rng_state.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:4a2d2d1561481739972beb9da4cda24cd79618e56b2904d90829eda87023314e
3
- size 14575
 
 
 
 
google-gemma-2b-it/checkpoint-1083/scheduler.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:14b00f4b4419499040f80c52731f61cf1e3cb06f8174abd72edb3fd6e413cc68
3
- size 627
 
 
 
 
google-gemma-2b-it/checkpoint-1083/trainer_state.json DELETED
@@ -1,306 +0,0 @@
1
- {
2
- "best_metric": 2.030155658721924,
3
- "best_model_checkpoint": "/uoa/scratch/users/u04ob20/attrib/data/models/google-gemma-2b-it/checkpoint-114",
4
- "epoch": 9.584070796460177,
5
- "eval_steps": 57,
6
- "global_step": 1083,
7
- "is_hyper_param_search": false,
8
- "is_local_process_zero": true,
9
- "is_world_process_zero": true,
10
- "log_history": [
11
- {
12
- "epoch": 0.5,
13
- "grad_norm": 17.672748565673828,
14
- "learning_rate": 4.747787610619469e-05,
15
- "loss": 2.9683,
16
- "step": 57
17
- },
18
- {
19
- "epoch": 0.5,
20
- "eval_loss": 2.1323869228363037,
21
- "eval_runtime": 14.1951,
22
- "eval_samples_per_second": 15.639,
23
- "eval_steps_per_second": 0.986,
24
- "step": 57
25
- },
26
- {
27
- "epoch": 1.01,
28
- "grad_norm": 10.006913185119629,
29
- "learning_rate": 4.495575221238939e-05,
30
- "loss": 1.9812,
31
- "step": 114
32
- },
33
- {
34
- "epoch": 1.01,
35
- "eval_loss": 2.030155658721924,
36
- "eval_runtime": 14.0967,
37
- "eval_samples_per_second": 15.748,
38
- "eval_steps_per_second": 0.993,
39
- "step": 114
40
- },
41
- {
42
- "epoch": 1.51,
43
- "grad_norm": 7.1079607009887695,
44
- "learning_rate": 4.243362831858407e-05,
45
- "loss": 1.6206,
46
- "step": 171
47
- },
48
- {
49
- "epoch": 1.51,
50
- "eval_loss": 2.0554354190826416,
51
- "eval_runtime": 14.0805,
52
- "eval_samples_per_second": 15.766,
53
- "eval_steps_per_second": 0.994,
54
- "step": 171
55
- },
56
- {
57
- "epoch": 2.02,
58
- "grad_norm": 8.306120872497559,
59
- "learning_rate": 3.991150442477876e-05,
60
- "loss": 1.6558,
61
- "step": 228
62
- },
63
- {
64
- "epoch": 2.02,
65
- "eval_loss": 2.124750852584839,
66
- "eval_runtime": 14.181,
67
- "eval_samples_per_second": 15.655,
68
- "eval_steps_per_second": 0.987,
69
- "step": 228
70
- },
71
- {
72
- "epoch": 2.52,
73
- "grad_norm": 5.911388874053955,
74
- "learning_rate": 3.7389380530973455e-05,
75
- "loss": 1.0749,
76
- "step": 285
77
- },
78
- {
79
- "epoch": 2.52,
80
- "eval_loss": 2.2880985736846924,
81
- "eval_runtime": 14.046,
82
- "eval_samples_per_second": 15.805,
83
- "eval_steps_per_second": 0.997,
84
- "step": 285
85
- },
86
- {
87
- "epoch": 3.03,
88
- "grad_norm": 5.9703569412231445,
89
- "learning_rate": 3.4867256637168145e-05,
90
- "loss": 1.1099,
91
- "step": 342
92
- },
93
- {
94
- "epoch": 3.03,
95
- "eval_loss": 2.6607654094696045,
96
- "eval_runtime": 14.1568,
97
- "eval_samples_per_second": 15.681,
98
- "eval_steps_per_second": 0.989,
99
- "step": 342
100
- },
101
- {
102
- "epoch": 3.53,
103
- "grad_norm": 7.199086666107178,
104
- "learning_rate": 3.2345132743362834e-05,
105
- "loss": 0.5349,
106
- "step": 399
107
- },
108
- {
109
- "epoch": 3.53,
110
- "eval_loss": 2.9111106395721436,
111
- "eval_runtime": 14.1631,
112
- "eval_samples_per_second": 15.675,
113
- "eval_steps_per_second": 0.988,
114
- "step": 399
115
- },
116
- {
117
- "epoch": 4.04,
118
- "grad_norm": 3.1459338665008545,
119
- "learning_rate": 2.982300884955752e-05,
120
- "loss": 0.5432,
121
- "step": 456
122
- },
123
- {
124
- "epoch": 4.04,
125
- "eval_loss": 3.114436149597168,
126
- "eval_runtime": 14.1302,
127
- "eval_samples_per_second": 15.711,
128
- "eval_steps_per_second": 0.991,
129
- "step": 456
130
- },
131
- {
132
- "epoch": 4.54,
133
- "grad_norm": 3.5249204635620117,
134
- "learning_rate": 2.7300884955752216e-05,
135
- "loss": 0.2523,
136
- "step": 513
137
- },
138
- {
139
- "epoch": 4.54,
140
- "eval_loss": 3.34505033493042,
141
- "eval_runtime": 14.1883,
142
- "eval_samples_per_second": 15.647,
143
- "eval_steps_per_second": 0.987,
144
- "step": 513
145
- },
146
- {
147
- "epoch": 5.04,
148
- "grad_norm": 3.153855085372925,
149
- "learning_rate": 2.4778761061946905e-05,
150
- "loss": 0.2561,
151
- "step": 570
152
- },
153
- {
154
- "epoch": 5.04,
155
- "eval_loss": 3.5140204429626465,
156
- "eval_runtime": 14.128,
157
- "eval_samples_per_second": 15.714,
158
- "eval_steps_per_second": 0.991,
159
- "step": 570
160
- },
161
- {
162
- "epoch": 5.55,
163
- "grad_norm": 3.072230339050293,
164
- "learning_rate": 2.2256637168141594e-05,
165
- "loss": 0.1508,
166
- "step": 627
167
- },
168
- {
169
- "epoch": 5.55,
170
- "eval_loss": 3.5723717212677,
171
- "eval_runtime": 14.0396,
172
- "eval_samples_per_second": 15.812,
173
- "eval_steps_per_second": 0.997,
174
- "step": 627
175
- },
176
- {
177
- "epoch": 6.05,
178
- "grad_norm": 1.96257746219635,
179
- "learning_rate": 1.9734513274336283e-05,
180
- "loss": 0.1365,
181
- "step": 684
182
- },
183
- {
184
- "epoch": 6.05,
185
- "eval_loss": 3.7443270683288574,
186
- "eval_runtime": 14.1133,
187
- "eval_samples_per_second": 15.73,
188
- "eval_steps_per_second": 0.992,
189
- "step": 684
190
- },
191
- {
192
- "epoch": 6.56,
193
- "grad_norm": 2.537320375442505,
194
- "learning_rate": 1.7212389380530976e-05,
195
- "loss": 0.0878,
196
- "step": 741
197
- },
198
- {
199
- "epoch": 6.56,
200
- "eval_loss": 3.926490545272827,
201
- "eval_runtime": 14.1716,
202
- "eval_samples_per_second": 15.665,
203
- "eval_steps_per_second": 0.988,
204
- "step": 741
205
- },
206
- {
207
- "epoch": 7.06,
208
- "grad_norm": 1.6090797185897827,
209
- "learning_rate": 1.4690265486725665e-05,
210
- "loss": 0.0841,
211
- "step": 798
212
- },
213
- {
214
- "epoch": 7.06,
215
- "eval_loss": 3.97700572013855,
216
- "eval_runtime": 14.1937,
217
- "eval_samples_per_second": 15.641,
218
- "eval_steps_per_second": 0.986,
219
- "step": 798
220
- },
221
- {
222
- "epoch": 7.57,
223
- "grad_norm": 1.7380380630493164,
224
- "learning_rate": 1.2168141592920354e-05,
225
- "loss": 0.0587,
226
- "step": 855
227
- },
228
- {
229
- "epoch": 7.57,
230
- "eval_loss": 4.071342468261719,
231
- "eval_runtime": 14.1098,
232
- "eval_samples_per_second": 15.734,
233
- "eval_steps_per_second": 0.992,
234
- "step": 855
235
- },
236
- {
237
- "epoch": 8.07,
238
- "grad_norm": 0.9195989370346069,
239
- "learning_rate": 9.646017699115045e-06,
240
- "loss": 0.0539,
241
- "step": 912
242
- },
243
- {
244
- "epoch": 8.07,
245
- "eval_loss": 4.22251558303833,
246
- "eval_runtime": 14.0932,
247
- "eval_samples_per_second": 15.752,
248
- "eval_steps_per_second": 0.993,
249
- "step": 912
250
- },
251
- {
252
- "epoch": 8.58,
253
- "grad_norm": 1.6740847826004028,
254
- "learning_rate": 7.123893805309735e-06,
255
- "loss": 0.0404,
256
- "step": 969
257
- },
258
- {
259
- "epoch": 8.58,
260
- "eval_loss": 4.388303279876709,
261
- "eval_runtime": 14.0715,
262
- "eval_samples_per_second": 15.777,
263
- "eval_steps_per_second": 0.995,
264
- "step": 969
265
- },
266
- {
267
- "epoch": 9.08,
268
- "grad_norm": 0.768718421459198,
269
- "learning_rate": 4.601769911504425e-06,
270
- "loss": 0.0383,
271
- "step": 1026
272
- },
273
- {
274
- "epoch": 9.08,
275
- "eval_loss": 4.46160364151001,
276
- "eval_runtime": 14.1199,
277
- "eval_samples_per_second": 15.722,
278
- "eval_steps_per_second": 0.992,
279
- "step": 1026
280
- },
281
- {
282
- "epoch": 9.58,
283
- "grad_norm": 0.8811420202255249,
284
- "learning_rate": 2.079646017699115e-06,
285
- "loss": 0.0292,
286
- "step": 1083
287
- },
288
- {
289
- "epoch": 9.58,
290
- "eval_loss": 4.561453819274902,
291
- "eval_runtime": 14.1759,
292
- "eval_samples_per_second": 15.66,
293
- "eval_steps_per_second": 0.988,
294
- "step": 1083
295
- }
296
- ],
297
- "logging_steps": 57,
298
- "max_steps": 1130,
299
- "num_input_tokens_seen": 0,
300
- "num_train_epochs": 10,
301
- "save_steps": 57,
302
- "total_flos": 2.305048903365427e+16,
303
- "train_batch_size": 14,
304
- "trial_name": null,
305
- "trial_params": null
306
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
google-gemma-2b-it/checkpoint-114/config.json DELETED
@@ -1,29 +0,0 @@
1
- {
2
- "_name_or_path": "google/gemma-2b-it",
3
- "architectures": [
4
- "GemmaForCausalLM"
5
- ],
6
- "attention_bias": false,
7
- "attention_dropout": 0.0,
8
- "bos_token_id": 2,
9
- "eos_token_id": 1,
10
- "head_dim": 256,
11
- "hidden_act": "gelu",
12
- "hidden_activation": null,
13
- "hidden_size": 2048,
14
- "initializer_range": 0.02,
15
- "intermediate_size": 16384,
16
- "max_position_embeddings": 8192,
17
- "model_type": "gemma",
18
- "num_attention_heads": 8,
19
- "num_hidden_layers": 18,
20
- "num_key_value_heads": 1,
21
- "pad_token_id": 0,
22
- "rms_norm_eps": 1e-06,
23
- "rope_scaling": null,
24
- "rope_theta": 10000.0,
25
- "torch_dtype": "float32",
26
- "transformers_version": "4.39.2",
27
- "use_cache": true,
28
- "vocab_size": 256000
29
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
google-gemma-2b-it/checkpoint-114/generation_config.json DELETED
@@ -1,7 +0,0 @@
1
- {
2
- "_from_model_config": true,
3
- "bos_token_id": 2,
4
- "eos_token_id": 1,
5
- "pad_token_id": 0,
6
- "transformers_version": "4.39.2"
7
- }
 
 
 
 
 
 
 
 
google-gemma-2b-it/checkpoint-114/model-00001-of-00003.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:91525f0c6a5c3e87dace4986c34f4d0e57786a9fbca7d38ac97a824a788c5a58
3
- size 4911635192
 
 
 
 
google-gemma-2b-it/checkpoint-114/model-00002-of-00003.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:f5b9d643a01749d1f9729f5ae73c8596e775cc94cefde63029a9f5db7ff294ec
3
- size 4978830584
 
 
 
 
google-gemma-2b-it/checkpoint-114/model-00003-of-00003.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:f6c8da4871411c8f0e88d67a68e5214333c72dc54a40eb2b06950f2c61e94446
3
- size 134242760
 
 
 
 
google-gemma-2b-it/checkpoint-114/model.safetensors.index.json DELETED
@@ -1,171 +0,0 @@
1
- {
2
- "metadata": {
3
- "total_size": 10024689664
4
- },
5
- "weight_map": {
6
- "model.embed_tokens.weight": "model-00001-of-00003.safetensors",
7
- "model.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors",
8
- "model.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
9
- "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
10
- "model.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
11
- "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
12
- "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
13
- "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
14
- "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
15
- "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
16
- "model.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors",
17
- "model.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
18
- "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
19
- "model.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
20
- "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
21
- "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
22
- "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
23
- "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
24
- "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
25
- "model.layers.10.input_layernorm.weight": "model-00002-of-00003.safetensors",
26
- "model.layers.10.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
27
- "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
28
- "model.layers.10.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
29
- "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
30
- "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
31
- "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
32
- "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
33
- "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
34
- "model.layers.11.input_layernorm.weight": "model-00002-of-00003.safetensors",
35
- "model.layers.11.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
36
- "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
37
- "model.layers.11.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
38
- "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
39
- "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
40
- "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
41
- "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
42
- "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
43
- "model.layers.12.input_layernorm.weight": "model-00002-of-00003.safetensors",
44
- "model.layers.12.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
45
- "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
46
- "model.layers.12.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
47
- "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
48
- "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
49
- "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
50
- "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
51
- "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
52
- "model.layers.13.input_layernorm.weight": "model-00002-of-00003.safetensors",
53
- "model.layers.13.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
54
- "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
55
- "model.layers.13.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
56
- "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
57
- "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
58
- "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
59
- "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
60
- "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
61
- "model.layers.14.input_layernorm.weight": "model-00002-of-00003.safetensors",
62
- "model.layers.14.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
63
- "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
64
- "model.layers.14.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
65
- "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
66
- "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
67
- "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
68
- "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
69
- "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
70
- "model.layers.15.input_layernorm.weight": "model-00002-of-00003.safetensors",
71
- "model.layers.15.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
72
- "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
73
- "model.layers.15.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
74
- "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
75
- "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
76
- "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
77
- "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
78
- "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
79
- "model.layers.16.input_layernorm.weight": "model-00002-of-00003.safetensors",
80
- "model.layers.16.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
81
- "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
82
- "model.layers.16.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
83
- "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
84
- "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
85
- "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
86
- "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
87
- "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
88
- "model.layers.17.input_layernorm.weight": "model-00003-of-00003.safetensors",
89
- "model.layers.17.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
90
- "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
91
- "model.layers.17.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
92
- "model.layers.17.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
93
- "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
94
- "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
95
- "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
96
- "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
97
- "model.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors",
98
- "model.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
99
- "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
100
- "model.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
101
- "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
102
- "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
103
- "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
104
- "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
105
- "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
106
- "model.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors",
107
- "model.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
108
- "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
109
- "model.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
110
- "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
111
- "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
112
- "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
113
- "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
114
- "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
115
- "model.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors",
116
- "model.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
117
- "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
118
- "model.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
119
- "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
120
- "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
121
- "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
122
- "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
123
- "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
124
- "model.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors",
125
- "model.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
126
- "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
127
- "model.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
128
- "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
129
- "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
130
- "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
131
- "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
132
- "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
133
- "model.layers.6.input_layernorm.weight": "model-00002-of-00003.safetensors",
134
- "model.layers.6.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
135
- "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
136
- "model.layers.6.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
137
- "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
138
- "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
139
- "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
140
- "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
141
- "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
142
- "model.layers.7.input_layernorm.weight": "model-00002-of-00003.safetensors",
143
- "model.layers.7.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
144
- "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
145
- "model.layers.7.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
146
- "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
147
- "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
148
- "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
149
- "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
150
- "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
151
- "model.layers.8.input_layernorm.weight": "model-00002-of-00003.safetensors",
152
- "model.layers.8.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
153
- "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
154
- "model.layers.8.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
155
- "model.layers.8.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
156
- "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
157
- "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
158
- "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
159
- "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
160
- "model.layers.9.input_layernorm.weight": "model-00002-of-00003.safetensors",
161
- "model.layers.9.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
162
- "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
163
- "model.layers.9.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
164
- "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
165
- "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
166
- "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
167
- "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
168
- "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
169
- "model.norm.weight": "model-00003-of-00003.safetensors"
170
- }
171
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
google-gemma-2b-it/checkpoint-114/optimizer.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:606209d3e689dcab96c72d8e8176e13522a9606daf7d5592cd31e9ea8fc74570
3
- size 20049522477
 
 
 
 
google-gemma-2b-it/checkpoint-114/rng_state.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:972139d83957a9cf2600cb6eeca17287d7a5377c33a53500ae7e13fe830ad36b
3
- size 14575
 
 
 
 
google-gemma-2b-it/checkpoint-114/scheduler.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:872cb3f6d8d4af7e8967d0b2c19cdfdee44e91913a41f93f345fcf22cc6842e9
3
- size 627
 
 
 
 
google-gemma-2b-it/checkpoint-114/trainer_state.json DELETED
@@ -1,51 +0,0 @@
1
- {
2
- "best_metric": 2.030155658721924,
3
- "best_model_checkpoint": "/uoa/scratch/users/u04ob20/attrib/data/models/google-gemma-2b-it/checkpoint-114",
4
- "epoch": 1.008849557522124,
5
- "eval_steps": 57,
6
- "global_step": 114,
7
- "is_hyper_param_search": false,
8
- "is_local_process_zero": true,
9
- "is_world_process_zero": true,
10
- "log_history": [
11
- {
12
- "epoch": 0.5,
13
- "grad_norm": 17.672748565673828,
14
- "learning_rate": 4.747787610619469e-05,
15
- "loss": 2.9683,
16
- "step": 57
17
- },
18
- {
19
- "epoch": 0.5,
20
- "eval_loss": 2.1323869228363037,
21
- "eval_runtime": 14.1951,
22
- "eval_samples_per_second": 15.639,
23
- "eval_steps_per_second": 0.986,
24
- "step": 57
25
- },
26
- {
27
- "epoch": 1.01,
28
- "grad_norm": 10.006913185119629,
29
- "learning_rate": 4.495575221238939e-05,
30
- "loss": 1.9812,
31
- "step": 114
32
- },
33
- {
34
- "epoch": 1.01,
35
- "eval_loss": 2.030155658721924,
36
- "eval_runtime": 14.0967,
37
- "eval_samples_per_second": 15.748,
38
- "eval_steps_per_second": 0.993,
39
- "step": 114
40
- }
41
- ],
42
- "logging_steps": 57,
43
- "max_steps": 1130,
44
- "num_input_tokens_seen": 0,
45
- "num_train_epochs": 10,
46
- "save_steps": 57,
47
- "total_flos": 2426207046991872.0,
48
- "train_batch_size": 14,
49
- "trial_name": null,
50
- "trial_params": null
51
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
google-gemma-2b-it/checkpoint-114/training_args.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:d7ea7db924a891dab8a05e67c6c15139ab02e8d8e2079f27e7342cf8878db67c
3
- size 4475
 
 
 
 
google-gemma-2b-it/config.json DELETED
@@ -1,29 +0,0 @@
1
- {
2
- "_name_or_path": "google/gemma-2b-it",
3
- "architectures": [
4
- "GemmaForCausalLM"
5
- ],
6
- "attention_bias": false,
7
- "attention_dropout": 0.0,
8
- "bos_token_id": 2,
9
- "eos_token_id": 1,
10
- "head_dim": 256,
11
- "hidden_act": "gelu",
12
- "hidden_activation": null,
13
- "hidden_size": 2048,
14
- "initializer_range": 0.02,
15
- "intermediate_size": 16384,
16
- "max_position_embeddings": 8192,
17
- "model_type": "gemma",
18
- "num_attention_heads": 8,
19
- "num_hidden_layers": 18,
20
- "num_key_value_heads": 1,
21
- "pad_token_id": 0,
22
- "rms_norm_eps": 1e-06,
23
- "rope_scaling": null,
24
- "rope_theta": 10000.0,
25
- "torch_dtype": "float32",
26
- "transformers_version": "4.39.2",
27
- "use_cache": true,
28
- "vocab_size": 256000
29
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
google-gemma-2b-it/generation_config.json DELETED
@@ -1,7 +0,0 @@
1
- {
2
- "_from_model_config": true,
3
- "bos_token_id": 2,
4
- "eos_token_id": 1,
5
- "pad_token_id": 0,
6
- "transformers_version": "4.39.2"
7
- }
 
 
 
 
 
 
 
 
google-gemma-2b-it/model.safetensors.index.json DELETED
@@ -1,171 +0,0 @@
1
- {
2
- "metadata": {
3
- "total_size": 10024689664
4
- },
5
- "weight_map": {
6
- "model.embed_tokens.weight": "model-00001-of-00003.safetensors",
7
- "model.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors",
8
- "model.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
9
- "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
10
- "model.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
11
- "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
12
- "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
13
- "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
14
- "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
15
- "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
16
- "model.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors",
17
- "model.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
18
- "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
19
- "model.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
20
- "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
21
- "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
22
- "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
23
- "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
24
- "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
25
- "model.layers.10.input_layernorm.weight": "model-00002-of-00003.safetensors",
26
- "model.layers.10.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
27
- "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
28
- "model.layers.10.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
29
- "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
30
- "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
31
- "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
32
- "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
33
- "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
34
- "model.layers.11.input_layernorm.weight": "model-00002-of-00003.safetensors",
35
- "model.layers.11.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
36
- "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
37
- "model.layers.11.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
38
- "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
39
- "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
40
- "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
41
- "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
42
- "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
43
- "model.layers.12.input_layernorm.weight": "model-00002-of-00003.safetensors",
44
- "model.layers.12.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
45
- "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
46
- "model.layers.12.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
47
- "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
48
- "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
49
- "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
50
- "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
51
- "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
52
- "model.layers.13.input_layernorm.weight": "model-00002-of-00003.safetensors",
53
- "model.layers.13.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
54
- "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
55
- "model.layers.13.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
56
- "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
57
- "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
58
- "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
59
- "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
60
- "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
61
- "model.layers.14.input_layernorm.weight": "model-00002-of-00003.safetensors",
62
- "model.layers.14.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
63
- "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
64
- "model.layers.14.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
65
- "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
66
- "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
67
- "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
68
- "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
69
- "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
70
- "model.layers.15.input_layernorm.weight": "model-00002-of-00003.safetensors",
71
- "model.layers.15.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
72
- "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
73
- "model.layers.15.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
74
- "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
75
- "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
76
- "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
77
- "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
78
- "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
79
- "model.layers.16.input_layernorm.weight": "model-00002-of-00003.safetensors",
80
- "model.layers.16.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
81
- "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
82
- "model.layers.16.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
83
- "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
84
- "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
85
- "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
86
- "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
87
- "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
88
- "model.layers.17.input_layernorm.weight": "model-00003-of-00003.safetensors",
89
- "model.layers.17.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
90
- "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
91
- "model.layers.17.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
92
- "model.layers.17.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
93
- "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
94
- "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
95
- "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
96
- "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
97
- "model.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors",
98
- "model.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
99
- "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
100
- "model.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
101
- "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
102
- "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
103
- "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
104
- "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
105
- "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
106
- "model.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors",
107
- "model.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
108
- "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
109
- "model.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
110
- "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
111
- "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
112
- "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
113
- "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
114
- "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
115
- "model.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors",
116
- "model.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
117
- "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
118
- "model.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
119
- "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
120
- "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
121
- "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
122
- "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
123
- "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
124
- "model.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors",
125
- "model.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
126
- "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
127
- "model.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
128
- "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
129
- "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
130
- "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
131
- "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
132
- "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
133
- "model.layers.6.input_layernorm.weight": "model-00002-of-00003.safetensors",
134
- "model.layers.6.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
135
- "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
136
- "model.layers.6.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
137
- "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
138
- "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
139
- "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
140
- "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
141
- "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
142
- "model.layers.7.input_layernorm.weight": "model-00002-of-00003.safetensors",
143
- "model.layers.7.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
144
- "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
145
- "model.layers.7.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
146
- "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
147
- "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
148
- "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
149
- "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
150
- "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
151
- "model.layers.8.input_layernorm.weight": "model-00002-of-00003.safetensors",
152
- "model.layers.8.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
153
- "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
154
- "model.layers.8.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
155
- "model.layers.8.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
156
- "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
157
- "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
158
- "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
159
- "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
160
- "model.layers.9.input_layernorm.weight": "model-00002-of-00003.safetensors",
161
- "model.layers.9.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
162
- "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
163
- "model.layers.9.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
164
- "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
165
- "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
166
- "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
167
- "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
168
- "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
169
- "model.norm.weight": "model-00003-of-00003.safetensors"
170
- }
171
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
google-gemma-2b-it/training_args.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:d7ea7db924a891dab8a05e67c6c15139ab02e8d8e2079f27e7342cf8878db67c
3
- size 4475
 
 
 
 
google-gemma-2b-it/model-00001-of-00003.safetensors → model-00001-of-00003.safetensors RENAMED
File without changes
google-gemma-2b-it/model-00002-of-00003.safetensors → model-00002-of-00003.safetensors RENAMED
File without changes
google-gemma-2b-it/model-00003-of-00003.safetensors → model-00003-of-00003.safetensors RENAMED
File without changes
google-gemma-2b-it/checkpoint-1083/model.safetensors.index.json → model.safetensors.index.json RENAMED
File without changes
google-gemma-2b-it/trainer_state.json → trainer_state.json RENAMED
File without changes
google-gemma-2b-it/checkpoint-1083/training_args.bin → training_args.bin RENAMED
File without changes