dq158 committed on
Commit
30d758e
·
1 Parent(s): e4f8f51

Training in progress, epoch 1, checkpoint

Browse files
last-checkpoint/config.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "_name_or_path": "morbius",
3
  "architectures": [
4
  "T5ForConditionalGeneration"
5
  ],
 
1
  {
2
+ "_name_or_path": "google/flan-t5-base",
3
  "architectures": [
4
  "T5ForConditionalGeneration"
5
  ],
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bc5f7966ab751e227a273a7acdbf9909b12ac56502470ca806ef6c19af692daa
3
- size 1832
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d7a4fd16e9ae253c9afbd45e7aebf5322152587fd9c2246dc052d32c0c14bc1f
3
+ size 1980860410
last-checkpoint/pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:459187e110db4e0fb7a78cfb68b5e2e416fc7bc717e4748019446f5cdf973209
3
  size 990409330
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7d3b863c7143522b5aa397705229b15a9b90278f38aa48eade14bd0d2fa1bf8b
3
  size 990409330
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:590ca6cf7a51bd34cebcce091bf7b19ed8c4800b52fae63fdebbcb234b0e6d1d
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:97ee14b3a48ed76d753441ec98ccbc21dde8e48d113653f330e853b90a7a2080
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b1860c7c6ca1e473d03a7cae1a98bbfbcae9f1a1ba246b538abe04ba78364f93
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:11766d8e6152c9f94c305c691168bbd6ac221c7a0d399a598ff357844e021322
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,429 +1,44 @@
1
  {
2
- "best_metric": NaN,
3
- "best_model_checkpoint": "dq158/morbius/checkpoint-144",
4
- "epoch": 19.913569576490925,
5
  "eval_steps": 500,
6
- "global_step": 2880,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 1.0,
13
- "eval_bleu": 1.0,
14
- "eval_brevity_penalty": 1.0,
15
- "eval_length_ratio": 1.0,
16
- "eval_loss": NaN,
17
- "eval_precisions": [
18
- 1.0,
19
- 1.0,
20
- 1.0,
21
- 1.0
22
- ],
23
- "eval_reference_length": 18771,
24
- "eval_runtime": 122.4923,
25
- "eval_samples_per_second": 8.392,
26
- "eval_steps_per_second": 1.053,
27
- "eval_translation_length": 18771,
28
- "step": 144
29
- },
30
- {
31
- "epoch": 2.0,
32
- "eval_bleu": 1.0,
33
- "eval_brevity_penalty": 1.0,
34
- "eval_length_ratio": 1.0,
35
- "eval_loss": NaN,
36
- "eval_precisions": [
37
- 1.0,
38
- 1.0,
39
- 1.0,
40
- 1.0
41
- ],
42
- "eval_reference_length": 18771,
43
- "eval_runtime": 122.6086,
44
- "eval_samples_per_second": 8.384,
45
- "eval_steps_per_second": 1.052,
46
- "eval_translation_length": 18771,
47
- "step": 289
48
- },
49
- {
50
- "epoch": 2.99,
51
- "eval_bleu": 1.0,
52
- "eval_brevity_penalty": 1.0,
53
- "eval_length_ratio": 1.0,
54
- "eval_loss": NaN,
55
- "eval_precisions": [
56
- 1.0,
57
- 1.0,
58
- 1.0,
59
- 1.0
60
- ],
61
- "eval_reference_length": 18771,
62
- "eval_runtime": 122.4683,
63
- "eval_samples_per_second": 8.394,
64
- "eval_steps_per_second": 1.053,
65
- "eval_translation_length": 18771,
66
- "step": 433
67
- },
68
- {
69
- "epoch": 3.46,
70
- "learning_rate": 1e-05,
71
- "loss": 0.0,
72
  "step": 500
73
  },
74
  {
75
- "epoch": 4.0,
76
- "eval_bleu": 1.0,
77
- "eval_brevity_penalty": 1.0,
78
- "eval_length_ratio": 1.0,
79
- "eval_loss": NaN,
80
- "eval_precisions": [
81
- 1.0,
82
- 1.0,
83
- 1.0,
84
- 1.0
85
- ],
86
- "eval_reference_length": 18771,
87
- "eval_runtime": 122.621,
88
- "eval_samples_per_second": 8.384,
89
- "eval_steps_per_second": 1.052,
90
- "eval_translation_length": 18771,
91
- "step": 578
92
- },
93
- {
94
- "epoch": 5.0,
95
- "eval_bleu": 1.0,
96
- "eval_brevity_penalty": 1.0,
97
- "eval_length_ratio": 1.0,
98
- "eval_loss": NaN,
99
- "eval_precisions": [
100
- 1.0,
101
- 1.0,
102
- 1.0,
103
- 1.0
104
- ],
105
- "eval_reference_length": 18771,
106
- "eval_runtime": 122.3833,
107
- "eval_samples_per_second": 8.4,
108
- "eval_steps_per_second": 1.054,
109
- "eval_translation_length": 18771,
110
- "step": 723
111
- },
112
- {
113
- "epoch": 5.99,
114
- "eval_bleu": 1.0,
115
- "eval_brevity_penalty": 1.0,
116
- "eval_length_ratio": 1.0,
117
- "eval_loss": NaN,
118
- "eval_precisions": [
119
- 1.0,
120
- 1.0,
121
- 1.0,
122
- 1.0
123
- ],
124
- "eval_reference_length": 18771,
125
- "eval_runtime": 122.7364,
126
- "eval_samples_per_second": 8.376,
127
- "eval_steps_per_second": 1.051,
128
- "eval_translation_length": 18771,
129
- "step": 867
130
- },
131
- {
132
- "epoch": 6.91,
133
- "learning_rate": 1e-05,
134
- "loss": 0.0,
135
- "step": 1000
136
- },
137
- {
138
- "epoch": 7.0,
139
- "eval_bleu": 1.0,
140
- "eval_brevity_penalty": 1.0,
141
- "eval_length_ratio": 1.0,
142
- "eval_loss": NaN,
143
- "eval_precisions": [
144
- 1.0,
145
- 1.0,
146
- 1.0,
147
- 1.0
148
- ],
149
- "eval_reference_length": 18771,
150
- "eval_runtime": 122.9033,
151
- "eval_samples_per_second": 8.364,
152
- "eval_steps_per_second": 1.05,
153
- "eval_translation_length": 18771,
154
- "step": 1012
155
- },
156
- {
157
- "epoch": 8.0,
158
- "eval_bleu": 1.0,
159
- "eval_brevity_penalty": 1.0,
160
- "eval_length_ratio": 1.0,
161
- "eval_loss": NaN,
162
- "eval_precisions": [
163
- 1.0,
164
- 1.0,
165
- 1.0,
166
- 1.0
167
- ],
168
- "eval_reference_length": 18771,
169
- "eval_runtime": 123.0241,
170
- "eval_samples_per_second": 8.356,
171
- "eval_steps_per_second": 1.049,
172
- "eval_translation_length": 18771,
173
- "step": 1157
174
- },
175
- {
176
- "epoch": 9.0,
177
- "eval_bleu": 1.0,
178
- "eval_brevity_penalty": 1.0,
179
- "eval_length_ratio": 1.0,
180
- "eval_loss": NaN,
181
- "eval_precisions": [
182
- 1.0,
183
- 1.0,
184
- 1.0,
185
- 1.0
186
- ],
187
- "eval_reference_length": 18771,
188
- "eval_runtime": 122.4135,
189
- "eval_samples_per_second": 8.398,
190
- "eval_steps_per_second": 1.054,
191
- "eval_translation_length": 18771,
192
- "step": 1301
193
- },
194
- {
195
- "epoch": 10.0,
196
- "eval_bleu": 1.0,
197
- "eval_brevity_penalty": 1.0,
198
- "eval_length_ratio": 1.0,
199
- "eval_loss": NaN,
200
- "eval_precisions": [
201
- 1.0,
202
- 1.0,
203
- 1.0,
204
- 1.0
205
- ],
206
- "eval_reference_length": 18771,
207
- "eval_runtime": 122.6454,
208
- "eval_samples_per_second": 8.382,
209
- "eval_steps_per_second": 1.052,
210
- "eval_translation_length": 18771,
211
- "step": 1446
212
- },
213
- {
214
- "epoch": 10.37,
215
- "learning_rate": 1e-05,
216
- "loss": 0.0,
217
- "step": 1500
218
- },
219
- {
220
- "epoch": 10.99,
221
- "eval_bleu": 1.0,
222
- "eval_brevity_penalty": 1.0,
223
- "eval_length_ratio": 1.0,
224
- "eval_loss": NaN,
225
- "eval_precisions": [
226
- 1.0,
227
- 1.0,
228
- 1.0,
229
- 1.0
230
- ],
231
- "eval_reference_length": 18771,
232
- "eval_runtime": 122.2623,
233
- "eval_samples_per_second": 8.408,
234
- "eval_steps_per_second": 1.055,
235
- "eval_translation_length": 18771,
236
- "step": 1590
237
- },
238
- {
239
- "epoch": 12.0,
240
- "eval_bleu": 1.0,
241
- "eval_brevity_penalty": 1.0,
242
- "eval_length_ratio": 1.0,
243
- "eval_loss": NaN,
244
- "eval_precisions": [
245
- 1.0,
246
- 1.0,
247
- 1.0,
248
- 1.0
249
- ],
250
- "eval_reference_length": 18771,
251
- "eval_runtime": 122.4484,
252
- "eval_samples_per_second": 8.395,
253
- "eval_steps_per_second": 1.054,
254
- "eval_translation_length": 18771,
255
- "step": 1735
256
- },
257
- {
258
- "epoch": 13.0,
259
- "eval_bleu": 1.0,
260
- "eval_brevity_penalty": 1.0,
261
- "eval_length_ratio": 1.0,
262
- "eval_loss": NaN,
263
- "eval_precisions": [
264
- 1.0,
265
- 1.0,
266
- 1.0,
267
- 1.0
268
- ],
269
- "eval_reference_length": 18771,
270
- "eval_runtime": 122.9159,
271
- "eval_samples_per_second": 8.363,
272
- "eval_steps_per_second": 1.049,
273
- "eval_translation_length": 18771,
274
- "step": 1880
275
- },
276
- {
277
- "epoch": 13.83,
278
- "learning_rate": 1e-05,
279
- "loss": 0.0,
280
- "step": 2000
281
- },
282
- {
283
- "epoch": 13.99,
284
- "eval_bleu": 1.0,
285
- "eval_brevity_penalty": 1.0,
286
- "eval_length_ratio": 1.0,
287
- "eval_loss": NaN,
288
- "eval_precisions": [
289
- 1.0,
290
- 1.0,
291
- 1.0,
292
- 1.0
293
- ],
294
- "eval_reference_length": 18771,
295
- "eval_runtime": 122.5545,
296
- "eval_samples_per_second": 8.388,
297
- "eval_steps_per_second": 1.053,
298
- "eval_translation_length": 18771,
299
- "step": 2024
300
- },
301
- {
302
- "epoch": 15.0,
303
- "eval_bleu": 1.0,
304
- "eval_brevity_penalty": 1.0,
305
- "eval_length_ratio": 1.0,
306
- "eval_loss": NaN,
307
- "eval_precisions": [
308
- 1.0,
309
- 1.0,
310
- 1.0,
311
- 1.0
312
- ],
313
- "eval_reference_length": 18771,
314
- "eval_runtime": 122.6122,
315
- "eval_samples_per_second": 8.384,
316
- "eval_steps_per_second": 1.052,
317
- "eval_translation_length": 18771,
318
- "step": 2169
319
- },
320
- {
321
- "epoch": 16.0,
322
- "eval_bleu": 1.0,
323
- "eval_brevity_penalty": 1.0,
324
- "eval_length_ratio": 1.0,
325
- "eval_loss": NaN,
326
- "eval_precisions": [
327
- 1.0,
328
- 1.0,
329
- 1.0,
330
- 1.0
331
- ],
332
- "eval_reference_length": 18771,
333
- "eval_runtime": 122.6643,
334
- "eval_samples_per_second": 8.381,
335
- "eval_steps_per_second": 1.052,
336
- "eval_translation_length": 18771,
337
- "step": 2314
338
- },
339
- {
340
- "epoch": 17.0,
341
- "eval_bleu": 1.0,
342
- "eval_brevity_penalty": 1.0,
343
- "eval_length_ratio": 1.0,
344
- "eval_loss": NaN,
345
- "eval_precisions": [
346
- 1.0,
347
- 1.0,
348
- 1.0,
349
- 1.0
350
- ],
351
- "eval_reference_length": 18771,
352
- "eval_runtime": 122.6275,
353
- "eval_samples_per_second": 8.383,
354
- "eval_steps_per_second": 1.052,
355
- "eval_translation_length": 18771,
356
- "step": 2458
357
- },
358
- {
359
- "epoch": 17.29,
360
- "learning_rate": 1e-05,
361
- "loss": 0.0,
362
- "step": 2500
363
- },
364
- {
365
- "epoch": 18.0,
366
- "eval_bleu": 1.0,
367
- "eval_brevity_penalty": 1.0,
368
- "eval_length_ratio": 1.0,
369
- "eval_loss": NaN,
370
- "eval_precisions": [
371
- 1.0,
372
- 1.0,
373
- 1.0,
374
- 1.0
375
- ],
376
- "eval_reference_length": 18771,
377
- "eval_runtime": 122.8755,
378
- "eval_samples_per_second": 8.366,
379
- "eval_steps_per_second": 1.05,
380
- "eval_translation_length": 18771,
381
- "step": 2603
382
- },
383
- {
384
- "epoch": 18.99,
385
- "eval_bleu": 1.0,
386
- "eval_brevity_penalty": 1.0,
387
- "eval_length_ratio": 1.0,
388
- "eval_loss": NaN,
389
- "eval_precisions": [
390
- 1.0,
391
- 1.0,
392
- 1.0,
393
- 1.0
394
- ],
395
- "eval_reference_length": 18771,
396
- "eval_runtime": 122.518,
397
- "eval_samples_per_second": 8.391,
398
- "eval_steps_per_second": 1.053,
399
- "eval_translation_length": 18771,
400
- "step": 2747
401
- },
402
- {
403
- "epoch": 19.91,
404
  "eval_bleu": 1.0,
405
  "eval_brevity_penalty": 1.0,
406
  "eval_length_ratio": 1.0,
407
- "eval_loss": NaN,
408
  "eval_precisions": [
409
  1.0,
410
  1.0,
411
  1.0,
412
  1.0
413
  ],
414
- "eval_reference_length": 18771,
415
- "eval_runtime": 122.5437,
416
- "eval_samples_per_second": 8.389,
417
- "eval_steps_per_second": 1.053,
418
- "eval_translation_length": 18771,
419
- "step": 2880
420
  }
421
  ],
422
  "logging_steps": 500,
423
- "max_steps": 2880,
424
- "num_train_epochs": 20,
425
  "save_steps": 500,
426
- "total_flos": 1.2616243364442931e+17,
427
  "trial_name": null,
428
  "trial_params": null
429
  }
 
1
  {
2
+ "best_metric": 2.3185157775878906,
3
+ "best_model_checkpoint": "dq158/morbius/checkpoint-859",
4
+ "epoch": 1.0,
5
  "eval_steps": 500,
6
+ "global_step": 859,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.58,
13
+ "learning_rate": 9e-05,
14
+ "loss": 3.0453,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  "step": 500
16
  },
17
  {
18
+ "epoch": 1.0,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  "eval_bleu": 1.0,
20
  "eval_brevity_penalty": 1.0,
21
  "eval_length_ratio": 1.0,
22
+ "eval_loss": 2.3185157775878906,
23
  "eval_precisions": [
24
  1.0,
25
  1.0,
26
  1.0,
27
  1.0
28
  ],
29
+ "eval_reference_length": 55790,
30
+ "eval_runtime": 652.4974,
31
+ "eval_samples_per_second": 4.68,
32
+ "eval_steps_per_second": 0.585,
33
+ "eval_translation_length": 55790,
34
+ "step": 859
35
  }
36
  ],
37
  "logging_steps": 500,
38
+ "max_steps": 25770,
39
+ "num_train_epochs": 30,
40
  "save_steps": 500,
41
+ "total_flos": 1.882124058938573e+16,
42
  "trial_name": null,
43
  "trial_params": null
44
  }
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e101b52461267c553f7820ea490d90f0c52a8eee984bb696ad906e06ac49bd9a
3
  size 4664
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cf0ee8131b75f80a4c15c9cd2f09dee6cc56de3d84cc9777afd07193cb1c1678
3
  size 4664