loris3 committed on
Commit
4ef16d0
·
verified ·
1 Parent(s): 695d4d1

Upload folder using huggingface_hub

Browse files
checkpoints/checkpoint-2126/config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "LlamaForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 1,
8
+ "eos_token_id": 2,
9
+ "head_dim": 64,
10
+ "hidden_act": "silu",
11
+ "hidden_size": 768,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 2048,
14
+ "max_position_embeddings": 256,
15
+ "mlp_bias": false,
16
+ "model_type": "llama",
17
+ "num_attention_heads": 12,
18
+ "num_hidden_layers": 12,
19
+ "num_key_value_heads": 12,
20
+ "pad_token_id": 1,
21
+ "pretraining_tp": 1,
22
+ "rms_norm_eps": 1e-06,
23
+ "rope_scaling": null,
24
+ "rope_theta": 10000.0,
25
+ "tie_word_embeddings": true,
26
+ "torch_dtype": "float32",
27
+ "transformers_version": "4.47.0",
28
+ "use_cache": true,
29
+ "vocab_size": 16000
30
+ }
checkpoints/checkpoint-2126/generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "pad_token_id": 1,
6
+ "transformers_version": "4.47.0"
7
+ }
checkpoints/checkpoint-2126/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bef1ce5ddc8ac0e10f8c70cb68970a83e1098e2f6d2a4a3d9acd6330c40aae60
3
+ size 388979624
checkpoints/checkpoint-2126/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:53432d77cdf2f82d29db4b7d5cb9ee2cafc8c8ee526b6ff0ddd446b488affdb0
3
+ size 778027770
checkpoints/checkpoint-2126/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4ab4163c79d02a2b1cd5e87832e03656d66b051535f06b7e0f0faade52cd7bd7
3
+ size 14244
checkpoints/checkpoint-2126/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:de4c480b1ad91c9a646c0f73668a7c419e355f528f9b6b3828c3775ea342d430
3
+ size 1064
checkpoints/checkpoint-2126/trainer_state.json ADDED
@@ -0,0 +1,367 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 3.847590099869735,
5
+ "eval_steps": 500,
6
+ "global_step": 2126,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.08684324793747286,
13
+ "grad_norm": 31.347537994384766,
14
+ "learning_rate": 0.00011666666666666667,
15
+ "loss": 94.1908,
16
+ "step": 50
17
+ },
18
+ {
19
+ "epoch": 0.17368649587494572,
20
+ "grad_norm": 17.979413986206055,
21
+ "learning_rate": 0.00023333333333333333,
22
+ "loss": 56.4907,
23
+ "step": 100
24
+ },
25
+ {
26
+ "epoch": 0.26052974381241856,
27
+ "grad_norm": 8.942031860351562,
28
+ "learning_rate": 0.00035,
29
+ "loss": 48.4783,
30
+ "step": 150
31
+ },
32
+ {
33
+ "epoch": 0.34737299174989145,
34
+ "grad_norm": 6.502614974975586,
35
+ "learning_rate": 0.00046666666666666666,
36
+ "loss": 45.3011,
37
+ "step": 200
38
+ },
39
+ {
40
+ "epoch": 0.4342162396873643,
41
+ "grad_norm": 6.777210235595703,
42
+ "learning_rate": 0.0005833333333333334,
43
+ "loss": 43.6152,
44
+ "step": 250
45
+ },
46
+ {
47
+ "epoch": 0.5210594876248371,
48
+ "grad_norm": 6.147511959075928,
49
+ "learning_rate": 0.0007,
50
+ "loss": 42.3483,
51
+ "step": 300
52
+ },
53
+ {
54
+ "epoch": 0.60790273556231,
55
+ "grad_norm": 5.621304988861084,
56
+ "learning_rate": 0.0006998546367133479,
57
+ "loss": 41.3118,
58
+ "step": 350
59
+ },
60
+ {
61
+ "epoch": 0.6947459834997829,
62
+ "grad_norm": 5.471296787261963,
63
+ "learning_rate": 0.0006994186675990208,
64
+ "loss": 40.7393,
65
+ "step": 400
66
+ },
67
+ {
68
+ "epoch": 0.7815892314372558,
69
+ "grad_norm": 7.381715774536133,
70
+ "learning_rate": 0.0006986924547936092,
71
+ "loss": 40.1379,
72
+ "step": 450
73
+ },
74
+ {
75
+ "epoch": 0.8684324793747286,
76
+ "grad_norm": 4.7940192222595215,
77
+ "learning_rate": 0.000697676601523857,
78
+ "loss": 39.8433,
79
+ "step": 500
80
+ },
81
+ {
82
+ "epoch": 0.9552757273122015,
83
+ "grad_norm": 4.409699440002441,
84
+ "learning_rate": 0.0006963719516055934,
85
+ "loss": 39.5783,
86
+ "step": 550
87
+ },
88
+ {
89
+ "epoch": 0.998697351280938,
90
+ "eval_accuracy": 0.0,
91
+ "eval_loss": 5.713693618774414,
92
+ "eval_normalizer": 966112.0,
93
+ "eval_runtime": 115.6974,
94
+ "eval_samples_per_second": 509.519,
95
+ "eval_steps_per_second": 1.003,
96
+ "step": 575
97
+ },
98
+ {
99
+ "epoch": 1.0434216239687364,
100
+ "grad_norm": 4.693251609802246,
101
+ "learning_rate": 0.0006947795887428181,
102
+ "loss": 38.3627,
103
+ "step": 600
104
+ },
105
+ {
106
+ "epoch": 1.1302648719062094,
107
+ "grad_norm": 5.077625274658203,
108
+ "learning_rate": 0.0006929008356275276,
109
+ "loss": 37.544,
110
+ "step": 650
111
+ },
112
+ {
113
+ "epoch": 1.2171081198436822,
114
+ "grad_norm": 4.627607345581055,
115
+ "learning_rate": 0.0006907372528410224,
116
+ "loss": 37.5222,
117
+ "step": 700
118
+ },
119
+ {
120
+ "epoch": 1.303951367781155,
121
+ "grad_norm": 4.705991744995117,
122
+ "learning_rate": 0.0006882906375576155,
123
+ "loss": 36.4945,
124
+ "step": 750
125
+ },
126
+ {
127
+ "epoch": 1.390794615718628,
128
+ "grad_norm": 4.745748996734619,
129
+ "learning_rate": 0.0006855630220518143,
130
+ "loss": 36.3402,
131
+ "step": 800
132
+ },
133
+ {
134
+ "epoch": 1.4776378636561007,
135
+ "grad_norm": 4.541324138641357,
136
+ "learning_rate": 0.0006825566720102167,
137
+ "loss": 36.423,
138
+ "step": 850
139
+ },
140
+ {
141
+ "epoch": 1.5644811115935737,
142
+ "grad_norm": 4.555329322814941,
143
+ "learning_rate": 0.0006792740846495249,
144
+ "loss": 36.4842,
145
+ "step": 900
146
+ },
147
+ {
148
+ "epoch": 1.6513243595310465,
149
+ "grad_norm": 4.505599021911621,
150
+ "learning_rate": 0.0006757179866422389,
151
+ "loss": 36.5019,
152
+ "step": 950
153
+ },
154
+ {
155
+ "epoch": 1.7381676074685193,
156
+ "grad_norm": 4.661733627319336,
157
+ "learning_rate": 0.0006718913318517527,
158
+ "loss": 36.0491,
159
+ "step": 1000
160
+ },
161
+ {
162
+ "epoch": 1.825010855405992,
163
+ "grad_norm": 4.7356438636779785,
164
+ "learning_rate": 0.0006677972988787362,
165
+ "loss": 33.5704,
166
+ "step": 1050
167
+ },
168
+ {
169
+ "epoch": 1.911854103343465,
170
+ "grad_norm": 4.665255069732666,
171
+ "learning_rate": 0.0006634392884208387,
172
+ "loss": 33.8845,
173
+ "step": 1100
174
+ },
175
+ {
176
+ "epoch": 1.998697351280938,
177
+ "grad_norm": 5.151551246643066,
178
+ "learning_rate": 0.0006588209204479085,
179
+ "loss": 34.1235,
180
+ "step": 1150
181
+ },
182
+ {
183
+ "epoch": 1.998697351280938,
184
+ "eval_accuracy": 0.0,
185
+ "eval_loss": 5.946074962615967,
186
+ "eval_normalizer": 966112.0,
187
+ "eval_runtime": 115.8973,
188
+ "eval_samples_per_second": 508.64,
189
+ "eval_steps_per_second": 1.001,
190
+ "step": 1150
191
+ },
192
+ {
193
+ "epoch": 2.086843247937473,
194
+ "grad_norm": 7.160614490509033,
195
+ "learning_rate": 0.0006539460311950741,
196
+ "loss": 75.2515,
197
+ "step": 1200
198
+ },
199
+ {
200
+ "epoch": 2.1736864958749456,
201
+ "grad_norm": 6.339908123016357,
202
+ "learning_rate": 0.000648818669976186,
203
+ "loss": 63.1052,
204
+ "step": 1250
205
+ },
206
+ {
207
+ "epoch": 2.260529743812419,
208
+ "grad_norm": 6.869708061218262,
209
+ "learning_rate": 0.0006434430958202652,
210
+ "loss": 55.7262,
211
+ "step": 1300
212
+ },
213
+ {
214
+ "epoch": 2.3473729917498916,
215
+ "grad_norm": 8.690558433532715,
216
+ "learning_rate": 0.0006378237739337511,
217
+ "loss": 46.9368,
218
+ "step": 1350
219
+ },
220
+ {
221
+ "epoch": 2.4342162396873643,
222
+ "grad_norm": 10.8461275100708,
223
+ "learning_rate": 0.0006319653719914907,
224
+ "loss": 36.8508,
225
+ "step": 1400
226
+ },
227
+ {
228
+ "epoch": 2.521059487624837,
229
+ "grad_norm": 11.691084861755371,
230
+ "learning_rate": 0.000625872756259546,
231
+ "loss": 27.586,
232
+ "step": 1450
233
+ },
234
+ {
235
+ "epoch": 2.60790273556231,
236
+ "grad_norm": 11.534943580627441,
237
+ "learning_rate": 0.0006195509875530431,
238
+ "loss": 20.8625,
239
+ "step": 1500
240
+ },
241
+ {
242
+ "epoch": 2.694745983499783,
243
+ "grad_norm": 10.615392684936523,
244
+ "learning_rate": 0.0006130053170324202,
245
+ "loss": 16.9027,
246
+ "step": 1550
247
+ },
248
+ {
249
+ "epoch": 2.781589231437256,
250
+ "grad_norm": 9.750032424926758,
251
+ "learning_rate": 0.000606241181841564,
252
+ "loss": 14.6911,
253
+ "step": 1600
254
+ },
255
+ {
256
+ "epoch": 2.847590099869735,
257
+ "eval_accuracy": 0.0,
258
+ "eval_loss": 7.1964263916015625,
259
+ "eval_normalizer": 966112.0,
260
+ "eval_runtime": 115.2002,
261
+ "eval_samples_per_second": 511.718,
262
+ "eval_steps_per_second": 1.007,
263
+ "step": 1638
264
+ },
265
+ {
266
+ "epoch": 3.0208423795049937,
267
+ "grad_norm": 9.399425506591797,
268
+ "learning_rate": 0.0005992642005914615,
269
+ "loss": 13.6775,
270
+ "step": 1650
271
+ },
272
+ {
273
+ "epoch": 3.1076856274424665,
274
+ "grad_norm": 8.75631046295166,
275
+ "learning_rate": 0.0005920801686931151,
276
+ "loss": 12.8369,
277
+ "step": 1700
278
+ },
279
+ {
280
+ "epoch": 3.1945288753799392,
281
+ "grad_norm": 8.309281349182129,
282
+ "learning_rate": 0.0005846950535436001,
283
+ "loss": 12.3939,
284
+ "step": 1750
285
+ },
286
+ {
287
+ "epoch": 3.281372123317412,
288
+ "grad_norm": 7.948273658752441,
289
+ "learning_rate": 0.0005771149895692616,
290
+ "loss": 12.1119,
291
+ "step": 1800
292
+ },
293
+ {
294
+ "epoch": 3.368215371254885,
295
+ "grad_norm": 7.4247727394104,
296
+ "learning_rate": 0.0005693462731301704,
297
+ "loss": 11.759,
298
+ "step": 1850
299
+ },
300
+ {
301
+ "epoch": 3.455058619192358,
302
+ "grad_norm": 7.026332378387451,
303
+ "learning_rate": 0.0005613953572900671,
304
+ "loss": 11.5219,
305
+ "step": 1900
306
+ },
307
+ {
308
+ "epoch": 3.541901867129831,
309
+ "grad_norm": 6.633806228637695,
310
+ "learning_rate": 0.0005532688464561429,
311
+ "loss": 11.3874,
312
+ "step": 1950
313
+ },
314
+ {
315
+ "epoch": 3.6287451150673036,
316
+ "grad_norm": 6.791120529174805,
317
+ "learning_rate": 0.0005449734908931053,
318
+ "loss": 11.2119,
319
+ "step": 2000
320
+ },
321
+ {
322
+ "epoch": 3.7155883630047764,
323
+ "grad_norm": 5.996912002563477,
324
+ "learning_rate": 0.0005365161811160892,
325
+ "loss": 11.0684,
326
+ "step": 2050
327
+ },
328
+ {
329
+ "epoch": 3.802431610942249,
330
+ "grad_norm": 5.432217121124268,
331
+ "learning_rate": 0.0005279039421670681,
332
+ "loss": 10.9551,
333
+ "step": 2100
334
+ },
335
+ {
336
+ "epoch": 3.847590099869735,
337
+ "eval_accuracy": 0.0,
338
+ "eval_loss": 8.056244850158691,
339
+ "eval_normalizer": 966112.0,
340
+ "eval_runtime": 115.2236,
341
+ "eval_samples_per_second": 511.614,
342
+ "eval_steps_per_second": 1.007,
343
+ "step": 2126
344
+ }
345
+ ],
346
+ "logging_steps": 50,
347
+ "max_steps": 5750,
348
+ "num_input_tokens_seen": 0,
349
+ "num_train_epochs": 10,
350
+ "save_steps": 500,
351
+ "stateful_callbacks": {
352
+ "TrainerControl": {
353
+ "args": {
354
+ "should_epoch_stop": false,
355
+ "should_evaluate": false,
356
+ "should_log": false,
357
+ "should_save": true,
358
+ "should_training_stop": false
359
+ },
360
+ "attributes": {}
361
+ }
362
+ },
363
+ "total_flos": 1.6549240745091226e+17,
364
+ "train_batch_size": 128,
365
+ "trial_name": null,
366
+ "trial_params": null
367
+ }
checkpoints/checkpoint-2126/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aa9a830de5c611a0b03c8b33ab915cf89010e60abaf6d5647f96cb57e683f0b5
3
+ size 5432