Plasmoxy commited on
Commit
8155e6e
·
verified ·
1 Parent(s): 3ed999d

End of training

Browse files
README.md CHANGED
@@ -14,7 +14,7 @@ should probably proofread and complete it, then remove this comment. -->
14
 
15
  This model was trained from scratch on an unknown dataset.
16
  It achieves the following results on the evaluation set:
17
- - Loss: 6.9342
18
 
19
  ## Model description
20
 
@@ -39,41 +39,21 @@ The following hyperparameters were used during training:
39
  - seed: 42
40
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
41
  - lr_scheduler_type: linear
42
- - num_epochs: 3
43
 
44
  ### Training results
45
 
46
  | Training Loss | Epoch | Step | Validation Loss |
47
  |:-------------:|:------:|:-----:|:---------------:|
48
- | 8.9413 | 0.1015 | 3000 | 7.2691 |
49
- | 7.9891 | 0.2030 | 6000 | 7.1297 |
50
- | 7.8872 | 0.3044 | 9000 | 7.0543 |
51
- | 7.8435 | 0.4059 | 12000 | 7.0230 |
52
- | 7.8154 | 0.5074 | 15000 | 6.9913 |
53
- | 7.8 | 0.6089 | 18000 | 6.9889 |
54
- | 7.7871 | 0.7104 | 21000 | 6.9722 |
55
- | 7.7788 | 0.8119 | 24000 | 6.9650 |
56
- | 7.7703 | 0.9133 | 27000 | 6.9571 |
57
- | 7.7673 | 1.0148 | 30000 | 6.9502 |
58
- | 7.7648 | 1.1163 | 33000 | 6.9452 |
59
- | 7.759 | 1.2178 | 36000 | 6.9439 |
60
- | 7.7526 | 1.3193 | 39000 | 6.9407 |
61
- | 7.7542 | 1.4207 | 42000 | 6.9415 |
62
- | 7.7581 | 1.5222 | 45000 | 6.9381 |
63
- | 7.7523 | 1.6237 | 48000 | 6.9360 |
64
- | 7.7533 | 1.7252 | 51000 | 6.9367 |
65
- | 7.7484 | 1.8267 | 54000 | 6.9364 |
66
- | 7.7508 | 1.9282 | 57000 | 6.9341 |
67
- | 7.7513 | 2.0296 | 60000 | 6.9370 |
68
- | 7.7527 | 2.1311 | 63000 | 6.9352 |
69
- | 7.7497 | 2.2326 | 66000 | 6.9347 |
70
- | 7.7499 | 2.3341 | 69000 | 6.9355 |
71
- | 7.7476 | 2.4356 | 72000 | 6.9353 |
72
- | 7.7498 | 2.5370 | 75000 | 6.9355 |
73
- | 7.7509 | 2.6385 | 78000 | 6.9350 |
74
- | 7.7484 | 2.7400 | 81000 | 6.9341 |
75
- | 7.7488 | 2.8415 | 84000 | 6.9343 |
76
- | 7.7486 | 2.9430 | 87000 | 6.9342 |
77
 
78
 
79
  ### Framework versions
 
14
 
15
  This model was trained from scratch on an unknown dataset.
16
  It achieves the following results on the evaluation set:
17
+ - Loss: 7.3662
18
 
19
  ## Model description
20
 
 
39
  - seed: 42
40
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
41
  - lr_scheduler_type: linear
42
+ - num_epochs: 1
43
 
44
  ### Training results
45
 
46
  | Training Loss | Epoch | Step | Validation Loss |
47
  |:-------------:|:------:|:-----:|:---------------:|
48
+ | 9.2307 | 0.1015 | 3000 | 7.5733 |
49
+ | 8.3153 | 0.2030 | 6000 | 7.4610 |
50
+ | 8.245 | 0.3044 | 9000 | 7.4058 |
51
+ | 8.2157 | 0.4059 | 12000 | 7.3872 |
52
+ | 8.2014 | 0.5074 | 15000 | 7.3764 |
53
+ | 8.1962 | 0.6089 | 18000 | 7.3736 |
54
+ | 8.1911 | 0.7104 | 21000 | 7.3708 |
55
+ | 8.1927 | 0.8119 | 24000 | 7.3668 |
56
+ | 8.1863 | 0.9133 | 27000 | 7.3662 |
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
 
58
 
59
  ### Framework versions
config.json CHANGED
@@ -28,5 +28,5 @@
28
  "torch_dtype": "bfloat16",
29
  "transformers_version": "4.45.2",
30
  "use_cache": true,
31
- "vocab_size": 160906
32
  }
 
28
  "torch_dtype": "bfloat16",
29
  "transformers_version": "4.45.2",
30
  "use_cache": true,
31
+ "vocab_size": 230895
32
  }
coreconfig.json CHANGED
@@ -15,7 +15,7 @@
15
  "max_target_length": 35,
16
  "batch_size": 128,
17
  "learning_rate": 0.0003,
18
- "num_train_epochs": 3,
19
  "pkg_versions": {
20
  "optimum": "1.23.3",
21
  "transformers": "4.45.2",
 
15
  "max_target_length": 35,
16
  "batch_size": 128,
17
  "learning_rate": 0.0003,
18
+ "num_train_epochs": 1,
19
  "pkg_versions": {
20
  "optimum": "1.23.3",
21
  "transformers": "4.45.2",
logs/events.out.tfevents.1745754496.gna4000.1514881.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:42510c0be55e23debf3754d832ea80e80e2703ecf2e783ddb93a528bc730a066
3
+ size 5489
logs/events.out.tfevents.1745754534.gna4000.1515101.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0afdfb0f22df4b1b189d7f2619cf80bc41876cc3976f2dab66e4fc6a554bdd72
3
+ size 10223
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:33aac971af3cebbb6e9652c4de0402f44677e55ec7bcbb3b1862819292ad88c1
3
- size 417682096
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9fd0cd85bfc88e3cc9d85dc4b00ee9584dcc94036b7925a2b46ceaf51e976c5a
3
+ size 561019568
nvidia_smi_early.log CHANGED
@@ -1,4 +1,4 @@
1
- Sun Apr 27 01:53:08 2025
2
  +-----------------------------------------------------------------------------------------+
3
  | NVIDIA-SMI 550.144.03 Driver Version: 550.144.03 CUDA Version: 12.4 |
4
  |-----------------------------------------+------------------------+----------------------+
@@ -7,7 +7,7 @@ Sun Apr 27 01:53:08 2025
7
  | | | MIG M. |
8
  |=========================================+========================+======================|
9
  | 0 NVIDIA RTX A4000 Off | 00000000:01:00.0 Off | Off |
10
- | 65% 87C P2 136W / 140W | 14367MiB / 16376MiB | 100% Default |
11
  | | | N/A |
12
  +-----------------------------------------+------------------------+----------------------+
13
 
@@ -17,6 +17,5 @@ Sun Apr 27 01:53:08 2025
17
  | ID ID Usage |
18
  |=========================================================================================|
19
  | 0 N/A N/A 1927 G /usr/lib/xorg/Xorg 4MiB |
20
- | 0 N/A N/A 1504681 C ...ik/miniforge3/envs/alpha/bin/python 1978MiB |
21
- | 0 N/A N/A 1505732 C ...ik/miniforge3/envs/alpha/bin/python 12372MiB |
22
  +-----------------------------------------------------------------------------------------+
 
1
+ Sun Apr 27 13:49:24 2025
2
  +-----------------------------------------------------------------------------------------+
3
  | NVIDIA-SMI 550.144.03 Driver Version: 550.144.03 CUDA Version: 12.4 |
4
  |-----------------------------------------+------------------------+----------------------+
 
7
  | | | MIG M. |
8
  |=========================================+========================+======================|
9
  | 0 NVIDIA RTX A4000 Off | 00000000:01:00.0 Off | Off |
10
+ | 53% 76C P2 137W / 140W | 14847MiB / 16376MiB | 100% Default |
11
  | | | N/A |
12
  +-----------------------------------------+------------------------+----------------------+
13
 
 
17
  | ID ID Usage |
18
  |=========================================================================================|
19
  | 0 N/A N/A 1927 G /usr/lib/xorg/Xorg 4MiB |
20
+ | 0 N/A N/A 1515101 C ...ik/miniforge3/envs/alpha/bin/python 14832MiB |
 
21
  +-----------------------------------------------------------------------------------------+
trainer_state.json CHANGED
@@ -1,469 +1,169 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 3.0,
5
  "eval_steps": 3000,
6
- "global_step": 88686,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 3.382721060821325e-05,
13
- "grad_norm": 79.0,
14
- "learning_rate": 0.00029999661727893914,
15
- "loss": 46.75,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.10148163182463973,
20
- "grad_norm": 1.6875,
21
- "learning_rate": 0.000289851836817536,
22
- "loss": 8.9413,
23
  "step": 3000
24
  },
25
  {
26
  "epoch": 0.10148163182463973,
27
- "eval_loss": 7.269054889678955,
28
- "eval_runtime": 84.8535,
29
- "eval_samples_per_second": 1112.565,
30
- "eval_steps_per_second": 8.697,
31
  "step": 3000
32
  },
33
  {
34
  "epoch": 0.20296326364927947,
35
- "grad_norm": 2.109375,
36
- "learning_rate": 0.000279703673635072,
37
- "loss": 7.9891,
38
  "step": 6000
39
  },
40
  {
41
  "epoch": 0.20296326364927947,
42
- "eval_loss": 7.129700183868408,
43
- "eval_runtime": 84.9004,
44
- "eval_samples_per_second": 1111.95,
45
- "eval_steps_per_second": 8.693,
46
  "step": 6000
47
  },
48
  {
49
  "epoch": 0.30444489547391923,
50
- "grad_norm": 1.609375,
51
- "learning_rate": 0.00026955551045260807,
52
- "loss": 7.8872,
53
  "step": 9000
54
  },
55
  {
56
  "epoch": 0.30444489547391923,
57
- "eval_loss": 7.054327487945557,
58
- "eval_runtime": 84.8699,
59
- "eval_samples_per_second": 1112.349,
60
- "eval_steps_per_second": 8.696,
61
  "step": 9000
62
  },
63
  {
64
  "epoch": 0.40592652729855894,
65
- "grad_norm": 1.7734375,
66
- "learning_rate": 0.0002594073472701441,
67
- "loss": 7.8435,
68
  "step": 12000
69
  },
70
  {
71
  "epoch": 0.40592652729855894,
72
- "eval_loss": 7.023035049438477,
73
- "eval_runtime": 84.8887,
74
- "eval_samples_per_second": 1112.103,
75
- "eval_steps_per_second": 8.694,
76
  "step": 12000
77
  },
78
  {
79
  "epoch": 0.5074081591231987,
80
- "grad_norm": 1.5703125,
81
- "learning_rate": 0.0002492591840876801,
82
- "loss": 7.8154,
83
  "step": 15000
84
  },
85
  {
86
  "epoch": 0.5074081591231987,
87
- "eval_loss": 6.99131965637207,
88
- "eval_runtime": 84.8972,
89
- "eval_samples_per_second": 1111.992,
90
- "eval_steps_per_second": 8.693,
91
  "step": 15000
92
  },
93
  {
94
  "epoch": 0.6088897909478385,
95
- "grad_norm": 1.515625,
96
- "learning_rate": 0.0002391110209052161,
97
- "loss": 7.8,
98
  "step": 18000
99
  },
100
  {
101
  "epoch": 0.6088897909478385,
102
- "eval_loss": 6.988863468170166,
103
- "eval_runtime": 84.8916,
104
- "eval_samples_per_second": 1112.065,
105
- "eval_steps_per_second": 8.693,
106
  "step": 18000
107
  },
108
  {
109
  "epoch": 0.7103714227724782,
110
- "grad_norm": 1.8359375,
111
- "learning_rate": 0.00022896285772275215,
112
- "loss": 7.7871,
113
  "step": 21000
114
  },
115
  {
116
  "epoch": 0.7103714227724782,
117
- "eval_loss": 6.972222328186035,
118
- "eval_runtime": 84.8906,
119
- "eval_samples_per_second": 1112.078,
120
- "eval_steps_per_second": 8.694,
121
  "step": 21000
122
  },
123
  {
124
  "epoch": 0.8118530545971179,
125
- "grad_norm": 1.6328125,
126
- "learning_rate": 0.0002188146945402882,
127
- "loss": 7.7788,
128
  "step": 24000
129
  },
130
  {
131
  "epoch": 0.8118530545971179,
132
- "eval_loss": 6.965023517608643,
133
- "eval_runtime": 85.0087,
134
- "eval_samples_per_second": 1110.533,
135
- "eval_steps_per_second": 8.681,
136
  "step": 24000
137
  },
138
  {
139
  "epoch": 0.9133346864217576,
140
- "grad_norm": 1.7421875,
141
- "learning_rate": 0.00020866653135782423,
142
- "loss": 7.7703,
143
  "step": 27000
144
  },
145
  {
146
  "epoch": 0.9133346864217576,
147
- "eval_loss": 6.9570631980896,
148
- "eval_runtime": 84.9952,
149
- "eval_samples_per_second": 1110.71,
150
- "eval_steps_per_second": 8.683,
151
  "step": 27000
152
  },
153
  {
154
- "epoch": 1.0148163182463974,
155
- "grad_norm": 1.8359375,
156
- "learning_rate": 0.00019851836817536025,
157
- "loss": 7.7673,
158
- "step": 30000
159
- },
160
- {
161
- "epoch": 1.0148163182463974,
162
- "eval_loss": 6.950160980224609,
163
- "eval_runtime": 84.9886,
164
- "eval_samples_per_second": 1110.796,
165
- "eval_steps_per_second": 8.684,
166
- "step": 30000
167
- },
168
- {
169
- "epoch": 1.116297950071037,
170
- "grad_norm": 1.859375,
171
- "learning_rate": 0.0001883702049928963,
172
- "loss": 7.7648,
173
- "step": 33000
174
- },
175
- {
176
- "epoch": 1.116297950071037,
177
- "eval_loss": 6.945206642150879,
178
- "eval_runtime": 84.9336,
179
- "eval_samples_per_second": 1111.515,
180
- "eval_steps_per_second": 8.689,
181
- "step": 33000
182
- },
183
- {
184
- "epoch": 1.217779581895677,
185
- "grad_norm": 1.7421875,
186
- "learning_rate": 0.0001782220418104323,
187
- "loss": 7.759,
188
- "step": 36000
189
- },
190
- {
191
- "epoch": 1.217779581895677,
192
- "eval_loss": 6.943936347961426,
193
- "eval_runtime": 85.0015,
194
- "eval_samples_per_second": 1110.627,
195
- "eval_steps_per_second": 8.682,
196
- "step": 36000
197
- },
198
- {
199
- "epoch": 1.3192612137203166,
200
- "grad_norm": 1.671875,
201
- "learning_rate": 0.00016807387862796832,
202
- "loss": 7.7526,
203
- "step": 39000
204
- },
205
- {
206
- "epoch": 1.3192612137203166,
207
- "eval_loss": 6.940675735473633,
208
- "eval_runtime": 85.0105,
209
- "eval_samples_per_second": 1110.51,
210
- "eval_steps_per_second": 8.681,
211
- "step": 39000
212
- },
213
- {
214
- "epoch": 1.4207428455449564,
215
- "grad_norm": 1.6640625,
216
- "learning_rate": 0.00015792571544550436,
217
- "loss": 7.7542,
218
- "step": 42000
219
- },
220
- {
221
- "epoch": 1.4207428455449564,
222
- "eval_loss": 6.941522598266602,
223
- "eval_runtime": 84.8589,
224
- "eval_samples_per_second": 1112.494,
225
- "eval_steps_per_second": 8.697,
226
- "step": 42000
227
- },
228
- {
229
- "epoch": 1.522224477369596,
230
- "grad_norm": 1.640625,
231
- "learning_rate": 0.00014777755226304037,
232
- "loss": 7.7581,
233
- "step": 45000
234
- },
235
- {
236
- "epoch": 1.522224477369596,
237
- "eval_loss": 6.938050270080566,
238
- "eval_runtime": 84.8679,
239
- "eval_samples_per_second": 1112.375,
240
- "eval_steps_per_second": 8.696,
241
- "step": 45000
242
- },
243
- {
244
- "epoch": 1.6237061091942357,
245
- "grad_norm": 1.6640625,
246
- "learning_rate": 0.0001376293890805764,
247
- "loss": 7.7523,
248
- "step": 48000
249
- },
250
- {
251
- "epoch": 1.6237061091942357,
252
- "eval_loss": 6.935975551605225,
253
- "eval_runtime": 84.8729,
254
- "eval_samples_per_second": 1112.31,
255
- "eval_steps_per_second": 8.695,
256
- "step": 48000
257
- },
258
- {
259
- "epoch": 1.7251877410188756,
260
- "grad_norm": 1.75,
261
- "learning_rate": 0.00012748122589811243,
262
- "loss": 7.7533,
263
- "step": 51000
264
- },
265
- {
266
- "epoch": 1.7251877410188756,
267
- "eval_loss": 6.936653137207031,
268
- "eval_runtime": 84.9125,
269
- "eval_samples_per_second": 1111.791,
270
- "eval_steps_per_second": 8.691,
271
- "step": 51000
272
- },
273
- {
274
- "epoch": 1.8266693728435153,
275
- "grad_norm": 1.59375,
276
- "learning_rate": 0.00011733306271564845,
277
- "loss": 7.7484,
278
- "step": 54000
279
- },
280
- {
281
- "epoch": 1.8266693728435153,
282
- "eval_loss": 6.936398983001709,
283
- "eval_runtime": 84.9096,
284
- "eval_samples_per_second": 1111.83,
285
- "eval_steps_per_second": 8.692,
286
- "step": 54000
287
- },
288
- {
289
- "epoch": 1.928151004668155,
290
- "grad_norm": 1.5390625,
291
- "learning_rate": 0.00010718489953318448,
292
- "loss": 7.7508,
293
- "step": 57000
294
- },
295
- {
296
- "epoch": 1.928151004668155,
297
- "eval_loss": 6.934112548828125,
298
- "eval_runtime": 84.9086,
299
- "eval_samples_per_second": 1111.843,
300
- "eval_steps_per_second": 8.692,
301
- "step": 57000
302
- },
303
- {
304
- "epoch": 2.029632636492795,
305
- "grad_norm": 1.7578125,
306
- "learning_rate": 9.703673635072052e-05,
307
- "loss": 7.7513,
308
- "step": 60000
309
- },
310
- {
311
- "epoch": 2.029632636492795,
312
- "eval_loss": 6.9370341300964355,
313
- "eval_runtime": 84.9523,
314
- "eval_samples_per_second": 1111.271,
315
- "eval_steps_per_second": 8.687,
316
- "step": 60000
317
- },
318
- {
319
- "epoch": 2.1311142683174347,
320
- "grad_norm": 1.6328125,
321
- "learning_rate": 8.688857316825655e-05,
322
- "loss": 7.7527,
323
- "step": 63000
324
- },
325
- {
326
- "epoch": 2.1311142683174347,
327
- "eval_loss": 6.935213565826416,
328
- "eval_runtime": 84.9291,
329
- "eval_samples_per_second": 1111.574,
330
- "eval_steps_per_second": 8.69,
331
- "step": 63000
332
- },
333
- {
334
- "epoch": 2.232595900142074,
335
- "grad_norm": 2.015625,
336
- "learning_rate": 7.674040998579256e-05,
337
- "loss": 7.7497,
338
- "step": 66000
339
- },
340
- {
341
- "epoch": 2.232595900142074,
342
- "eval_loss": 6.9347052574157715,
343
- "eval_runtime": 84.9721,
344
- "eval_samples_per_second": 1111.011,
345
- "eval_steps_per_second": 8.685,
346
- "step": 66000
347
- },
348
- {
349
- "epoch": 2.334077531966714,
350
- "grad_norm": 1.609375,
351
- "learning_rate": 6.659224680332859e-05,
352
- "loss": 7.7499,
353
- "step": 69000
354
- },
355
- {
356
- "epoch": 2.334077531966714,
357
- "eval_loss": 6.93546724319458,
358
- "eval_runtime": 85.0884,
359
- "eval_samples_per_second": 1109.493,
360
- "eval_steps_per_second": 8.673,
361
- "step": 69000
362
- },
363
- {
364
- "epoch": 2.435559163791354,
365
- "grad_norm": 1.515625,
366
- "learning_rate": 5.644408362086462e-05,
367
- "loss": 7.7476,
368
- "step": 72000
369
- },
370
- {
371
- "epoch": 2.435559163791354,
372
- "eval_loss": 6.935297966003418,
373
- "eval_runtime": 85.0655,
374
- "eval_samples_per_second": 1109.791,
375
- "eval_steps_per_second": 8.676,
376
- "step": 72000
377
- },
378
- {
379
- "epoch": 2.5370407956159937,
380
- "grad_norm": 1.6875,
381
- "learning_rate": 4.629592043840065e-05,
382
- "loss": 7.7498,
383
- "step": 75000
384
- },
385
- {
386
- "epoch": 2.5370407956159937,
387
- "eval_loss": 6.93550968170166,
388
- "eval_runtime": 85.1217,
389
- "eval_samples_per_second": 1109.059,
390
- "eval_steps_per_second": 8.67,
391
- "step": 75000
392
- },
393
- {
394
- "epoch": 2.638522427440633,
395
- "grad_norm": 1.5625,
396
- "learning_rate": 3.614775725593667e-05,
397
- "loss": 7.7509,
398
- "step": 78000
399
- },
400
- {
401
- "epoch": 2.638522427440633,
402
- "eval_loss": 6.935043811798096,
403
- "eval_runtime": 85.1168,
404
- "eval_samples_per_second": 1109.123,
405
- "eval_steps_per_second": 8.67,
406
- "step": 78000
407
- },
408
- {
409
- "epoch": 2.740004059265273,
410
- "grad_norm": 1.6171875,
411
- "learning_rate": 2.59995940734727e-05,
412
- "loss": 7.7484,
413
- "step": 81000
414
- },
415
- {
416
- "epoch": 2.740004059265273,
417
- "eval_loss": 6.934070110321045,
418
- "eval_runtime": 85.1525,
419
- "eval_samples_per_second": 1108.658,
420
- "eval_steps_per_second": 8.667,
421
- "step": 81000
422
- },
423
- {
424
- "epoch": 2.841485691089913,
425
- "grad_norm": 1.671875,
426
- "learning_rate": 1.5851430891008727e-05,
427
- "loss": 7.7488,
428
- "step": 84000
429
- },
430
- {
431
- "epoch": 2.841485691089913,
432
- "eval_loss": 6.934324264526367,
433
- "eval_runtime": 85.1433,
434
- "eval_samples_per_second": 1108.778,
435
- "eval_steps_per_second": 8.668,
436
- "step": 84000
437
- },
438
- {
439
- "epoch": 2.9429673229145523,
440
- "grad_norm": 1.671875,
441
- "learning_rate": 5.703267708544753e-06,
442
- "loss": 7.7486,
443
- "step": 87000
444
- },
445
- {
446
- "epoch": 2.9429673229145523,
447
- "eval_loss": 6.934196949005127,
448
- "eval_runtime": 85.1307,
449
- "eval_samples_per_second": 1108.941,
450
- "eval_steps_per_second": 8.669,
451
- "step": 87000
452
- },
453
- {
454
- "epoch": 3.0,
455
- "step": 88686,
456
- "total_flos": 9.645499494395412e+17,
457
- "train_loss": 7.815265022100444,
458
- "train_runtime": 33323.0229,
459
- "train_samples_per_second": 340.649,
460
- "train_steps_per_second": 2.661
461
  }
462
  ],
463
  "logging_steps": 3000,
464
- "max_steps": 88686,
465
  "num_input_tokens_seen": 0,
466
- "num_train_epochs": 3,
467
  "save_steps": 3000,
468
  "stateful_callbacks": {
469
  "TrainerControl": {
@@ -477,7 +177,7 @@
477
  "attributes": {}
478
  }
479
  },
480
- "total_flos": 9.645499494395412e+17,
481
  "train_batch_size": 128,
482
  "trial_name": null,
483
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 1.0,
5
  "eval_steps": 3000,
6
+ "global_step": 29562,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 3.382721060821325e-05,
13
+ "grad_norm": 86.5,
14
+ "learning_rate": 0.0002999898518368175,
15
+ "loss": 50.5,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.10148163182463973,
20
+ "grad_norm": 1.484375,
21
+ "learning_rate": 0.00026955551045260807,
22
+ "loss": 9.2307,
23
  "step": 3000
24
  },
25
  {
26
  "epoch": 0.10148163182463973,
27
+ "eval_loss": 7.57329797744751,
28
+ "eval_runtime": 107.8469,
29
+ "eval_samples_per_second": 875.361,
30
+ "eval_steps_per_second": 6.843,
31
  "step": 3000
32
  },
33
  {
34
  "epoch": 0.20296326364927947,
35
+ "grad_norm": 1.75,
36
+ "learning_rate": 0.0002391110209052161,
37
+ "loss": 8.3153,
38
  "step": 6000
39
  },
40
  {
41
  "epoch": 0.20296326364927947,
42
+ "eval_loss": 7.460958480834961,
43
+ "eval_runtime": 107.9107,
44
+ "eval_samples_per_second": 874.844,
45
+ "eval_steps_per_second": 6.839,
46
  "step": 6000
47
  },
48
  {
49
  "epoch": 0.30444489547391923,
50
+ "grad_norm": 1.953125,
51
+ "learning_rate": 0.00020866653135782423,
52
+ "loss": 8.245,
53
  "step": 9000
54
  },
55
  {
56
  "epoch": 0.30444489547391923,
57
+ "eval_loss": 7.405826568603516,
58
+ "eval_runtime": 107.9533,
59
+ "eval_samples_per_second": 874.498,
60
+ "eval_steps_per_second": 6.836,
61
  "step": 9000
62
  },
63
  {
64
  "epoch": 0.40592652729855894,
65
+ "grad_norm": 1.40625,
66
+ "learning_rate": 0.0001782220418104323,
67
+ "loss": 8.2157,
68
  "step": 12000
69
  },
70
  {
71
  "epoch": 0.40592652729855894,
72
+ "eval_loss": 7.387195110321045,
73
+ "eval_runtime": 107.9361,
74
+ "eval_samples_per_second": 874.638,
75
+ "eval_steps_per_second": 6.837,
76
  "step": 12000
77
  },
78
  {
79
  "epoch": 0.5074081591231987,
80
+ "grad_norm": 2.375,
81
+ "learning_rate": 0.00014777755226304037,
82
+ "loss": 8.2014,
83
  "step": 15000
84
  },
85
  {
86
  "epoch": 0.5074081591231987,
87
+ "eval_loss": 7.376355171203613,
88
+ "eval_runtime": 107.9488,
89
+ "eval_samples_per_second": 874.535,
90
+ "eval_steps_per_second": 6.837,
91
  "step": 15000
92
  },
93
  {
94
  "epoch": 0.6088897909478385,
95
+ "grad_norm": 1.5,
96
+ "learning_rate": 0.00011733306271564845,
97
+ "loss": 8.1962,
98
  "step": 18000
99
  },
100
  {
101
  "epoch": 0.6088897909478385,
102
+ "eval_loss": 7.373560428619385,
103
+ "eval_runtime": 107.921,
104
+ "eval_samples_per_second": 874.76,
105
+ "eval_steps_per_second": 6.838,
106
  "step": 18000
107
  },
108
  {
109
  "epoch": 0.7103714227724782,
110
+ "grad_norm": 1.484375,
111
+ "learning_rate": 8.688857316825655e-05,
112
+ "loss": 8.1911,
113
  "step": 21000
114
  },
115
  {
116
  "epoch": 0.7103714227724782,
117
+ "eval_loss": 7.370765686035156,
118
+ "eval_runtime": 108.0558,
119
+ "eval_samples_per_second": 873.669,
120
+ "eval_steps_per_second": 6.83,
121
  "step": 21000
122
  },
123
  {
124
  "epoch": 0.8118530545971179,
125
+ "grad_norm": 1.5078125,
126
+ "learning_rate": 5.644408362086462e-05,
127
+ "loss": 8.1927,
128
  "step": 24000
129
  },
130
  {
131
  "epoch": 0.8118530545971179,
132
+ "eval_loss": 7.366785049438477,
133
+ "eval_runtime": 108.0458,
134
+ "eval_samples_per_second": 873.75,
135
+ "eval_steps_per_second": 6.83,
136
  "step": 24000
137
  },
138
  {
139
  "epoch": 0.9133346864217576,
140
+ "grad_norm": 1.8125,
141
+ "learning_rate": 2.59995940734727e-05,
142
+ "loss": 8.1863,
143
  "step": 27000
144
  },
145
  {
146
  "epoch": 0.9133346864217576,
147
+ "eval_loss": 7.36619234085083,
148
+ "eval_runtime": 108.0404,
149
+ "eval_samples_per_second": 873.793,
150
+ "eval_steps_per_second": 6.831,
151
  "step": 27000
152
  },
153
  {
154
+ "epoch": 1.0,
155
+ "step": 29562,
156
+ "total_flos": 4.126336903687864e+17,
157
+ "train_loss": 8.319568829916784,
158
+ "train_runtime": 13641.1405,
159
+ "train_samples_per_second": 277.383,
160
+ "train_steps_per_second": 2.167
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
161
  }
162
  ],
163
  "logging_steps": 3000,
164
+ "max_steps": 29562,
165
  "num_input_tokens_seen": 0,
166
+ "num_train_epochs": 1,
167
  "save_steps": 3000,
168
  "stateful_callbacks": {
169
  "TrainerControl": {
 
177
  "attributes": {}
178
  }
179
  },
180
+ "total_flos": 4.126336903687864e+17,
181
  "train_batch_size": 128,
182
  "trial_name": null,
183
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7dfa9f6d96f97f77642d772f1add97a44e9d981a1457ef9c6abcac4bd9fae22e
3
  size 5368
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c59cb6570e6cfaa5146fb53f8f126100f394dcdef7e0476d8633ec233dc03dae
3
  size 5368
training_args.json CHANGED
@@ -20,7 +20,7 @@
20
  "adam_beta2": 0.999,
21
  "adam_epsilon": 1e-08,
22
  "max_grad_norm": 1.0,
23
- "num_train_epochs": 3,
24
  "max_steps": -1,
25
  "lr_scheduler_type": "linear",
26
  "lr_scheduler_kwargs": {},
 
20
  "adam_beta2": 0.999,
21
  "adam_epsilon": 1e-08,
22
  "max_grad_norm": 1.0,
23
+ "num_train_epochs": 1,
24
  "max_steps": -1,
25
  "lr_scheduler_type": "linear",
26
  "lr_scheduler_kwargs": {},