ChiefTheLord commited on
Commit
d729989
·
verified ·
1 Parent(s): c064ada

Upload folder using huggingface_hub

Browse files
checkpoints/checkpoint-408/adapter.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aaf5abcb61d1492002aac91a16ea3bdecba80ccfa546de2e84418b8fa0168721
3
+ size 6439640
checkpoints/checkpoint-408/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c8daf7c41f9dfd692fa1bc1d86914b7c497e2c39d1891b6c0e352de5ab5ff173
3
+ size 3304962
checkpoints/checkpoint-408/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5ed0adb262ae5a32b0426ae5f447cd38d4498b7741803b88609e5036322fb8d3
3
+ size 14244
checkpoints/checkpoint-408/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e8c0d01d51244373dd2dd86b408e09445988f26a5dd6fa08072fd5289a38c0ab
3
+ size 1064
checkpoints/checkpoint-408/trainer_state.json ADDED
@@ -0,0 +1,360 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 8.0,
5
+ "eval_steps": 500,
6
+ "global_step": 408,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.3137254901960784,
13
+ "grad_norm": 2.9848222732543945,
14
+ "learning_rate": 1.4634146341463413e-07,
15
+ "loss": 9.4214,
16
+ "step": 16
17
+ },
18
+ {
19
+ "epoch": 0.6274509803921569,
20
+ "grad_norm": 3.9102113246917725,
21
+ "learning_rate": 3.4146341463414634e-07,
22
+ "loss": 9.4215,
23
+ "step": 32
24
+ },
25
+ {
26
+ "epoch": 0.9411764705882353,
27
+ "grad_norm": 3.406308174133301,
28
+ "learning_rate": 5.365853658536586e-07,
29
+ "loss": 9.421,
30
+ "step": 48
31
+ },
32
+ {
33
+ "epoch": 1.0,
34
+ "eval_bleu": 0.27230328735060844,
35
+ "eval_cap_loss": 2.1642436525400948,
36
+ "eval_con_loss": 2.0591190656026206,
37
+ "eval_loss": 5.200359978512222,
38
+ "step": 51
39
+ },
40
+ {
41
+ "epoch": 1.0,
42
+ "eval_bleu": 0.27230328735060844,
43
+ "eval_cap_loss": 2.1642436525400948,
44
+ "eval_con_loss": 2.0591190656026206,
45
+ "eval_loss": 5.200359978512222,
46
+ "eval_runtime": 30.9761,
47
+ "eval_samples_per_second": 26.117,
48
+ "eval_steps_per_second": 3.293,
49
+ "step": 51
50
+ },
51
+ {
52
+ "epoch": 1.2549019607843137,
53
+ "grad_norm": 2.83813738822937,
54
+ "learning_rate": 7.317073170731707e-07,
55
+ "loss": 9.2807,
56
+ "step": 64
57
+ },
58
+ {
59
+ "epoch": 1.5686274509803921,
60
+ "grad_norm": 2.936375379562378,
61
+ "learning_rate": 9.146341463414634e-07,
62
+ "loss": 9.406,
63
+ "step": 80
64
+ },
65
+ {
66
+ "epoch": 1.8823529411764706,
67
+ "grad_norm": 3.668606996536255,
68
+ "learning_rate": 9.981206082534287e-07,
69
+ "loss": 9.3986,
70
+ "step": 96
71
+ },
72
+ {
73
+ "epoch": 2.0,
74
+ "eval_bleu": 0.27320616813251447,
75
+ "eval_cap_loss": 2.1230438223072126,
76
+ "eval_con_loss": 2.059116809975867,
77
+ "eval_loss": 5.179755524677389,
78
+ "step": 102
79
+ },
80
+ {
81
+ "epoch": 2.0,
82
+ "eval_bleu": 0.27320616813251447,
83
+ "eval_cap_loss": 2.1230438223072126,
84
+ "eval_con_loss": 2.059116809975867,
85
+ "eval_loss": 5.179755524677389,
86
+ "eval_runtime": 31.0,
87
+ "eval_samples_per_second": 26.097,
88
+ "eval_steps_per_second": 3.29,
89
+ "step": 102
90
+ },
91
+ {
92
+ "epoch": 2.196078431372549,
93
+ "grad_norm": 2.896123170852661,
94
+ "learning_rate": 9.855594766321122e-07,
95
+ "loss": 9.2709,
96
+ "step": 112
97
+ },
98
+ {
99
+ "epoch": 2.5098039215686274,
100
+ "grad_norm": 2.431732177734375,
101
+ "learning_rate": 9.614774462458572e-07,
102
+ "loss": 9.3793,
103
+ "step": 128
104
+ },
105
+ {
106
+ "epoch": 2.8235294117647056,
107
+ "grad_norm": 2.8162143230438232,
108
+ "learning_rate": 9.264459128710549e-07,
109
+ "loss": 9.382,
110
+ "step": 144
111
+ },
112
+ {
113
+ "epoch": 3.0,
114
+ "eval_bleu": 0.27320213706102187,
115
+ "eval_cap_loss": 2.082164978279787,
116
+ "eval_con_loss": 2.0591136591107237,
117
+ "eval_loss": 5.159309838332382,
118
+ "step": 153
119
+ },
120
+ {
121
+ "epoch": 3.0,
122
+ "eval_bleu": 0.27320213706102187,
123
+ "eval_cap_loss": 2.082164978279787,
124
+ "eval_con_loss": 2.0591136591107237,
125
+ "eval_loss": 5.159309838332382,
126
+ "eval_runtime": 31.2438,
127
+ "eval_samples_per_second": 25.893,
128
+ "eval_steps_per_second": 3.265,
129
+ "step": 153
130
+ },
131
+ {
132
+ "epoch": 3.1372549019607843,
133
+ "grad_norm": 2.2468996047973633,
134
+ "learning_rate": 8.812960717968818e-07,
135
+ "loss": 9.2376,
136
+ "step": 160
137
+ },
138
+ {
139
+ "epoch": 3.450980392156863,
140
+ "grad_norm": 2.2097907066345215,
141
+ "learning_rate": 8.27099196004923e-07,
142
+ "loss": 9.3674,
143
+ "step": 176
144
+ },
145
+ {
146
+ "epoch": 3.764705882352941,
147
+ "grad_norm": 2.2787301540374756,
148
+ "learning_rate": 7.651412180110175e-07,
149
+ "loss": 9.3575,
150
+ "step": 192
151
+ },
152
+ {
153
+ "epoch": 4.0,
154
+ "eval_bleu": 0.27339830557499767,
155
+ "eval_cap_loss": 2.052475500340555,
156
+ "eval_con_loss": 2.059111627877927,
157
+ "eval_loss": 5.144460982551761,
158
+ "step": 204
159
+ },
160
+ {
161
+ "epoch": 4.0,
162
+ "eval_bleu": 0.27339830557499767,
163
+ "eval_cap_loss": 2.052475500340555,
164
+ "eval_con_loss": 2.059111627877927,
165
+ "eval_loss": 5.144460982551761,
166
+ "eval_runtime": 31.4567,
167
+ "eval_samples_per_second": 25.718,
168
+ "eval_steps_per_second": 3.243,
169
+ "step": 204
170
+ },
171
+ {
172
+ "epoch": 4.078431372549019,
173
+ "grad_norm": 2.41097092628479,
174
+ "learning_rate": 6.968922184674867e-07,
175
+ "loss": 9.2198,
176
+ "step": 208
177
+ },
178
+ {
179
+ "epoch": 4.392156862745098,
180
+ "grad_norm": 2.1482274532318115,
181
+ "learning_rate": 6.239715454715053e-07,
182
+ "loss": 9.3524,
183
+ "step": 224
184
+ },
185
+ {
186
+ "epoch": 4.705882352941177,
187
+ "grad_norm": 4.837054252624512,
188
+ "learning_rate": 5.481093921958749e-07,
189
+ "loss": 9.3413,
190
+ "step": 240
191
+ },
192
+ {
193
+ "epoch": 5.0,
194
+ "eval_bleu": 0.27362478077654184,
195
+ "eval_cap_loss": 2.0312489189353644,
196
+ "eval_con_loss": 2.059109935573503,
197
+ "eval_loss": 5.133844311330833,
198
+ "step": 255
199
+ },
200
+ {
201
+ "epoch": 5.0,
202
+ "eval_bleu": 0.27362478077654184,
203
+ "eval_cap_loss": 2.0312489189353644,
204
+ "eval_con_loss": 2.059109935573503,
205
+ "eval_loss": 5.133844311330833,
206
+ "eval_runtime": 30.935,
207
+ "eval_samples_per_second": 26.152,
208
+ "eval_steps_per_second": 3.297,
209
+ "step": 255
210
+ },
211
+ {
212
+ "epoch": 5.019607843137255,
213
+ "grad_norm": 2.2715868949890137,
214
+ "learning_rate": 4.7110574449205214e-07,
215
+ "loss": 9.2202,
216
+ "step": 256
217
+ },
218
+ {
219
+ "epoch": 5.333333333333333,
220
+ "grad_norm": 2.1526544094085693,
221
+ "learning_rate": 3.9478767251811595e-07,
222
+ "loss": 9.3375,
223
+ "step": 272
224
+ },
225
+ {
226
+ "epoch": 5.647058823529412,
227
+ "grad_norm": 1.8560357093811035,
228
+ "learning_rate": 3.209659797357669e-07,
229
+ "loss": 9.338,
230
+ "step": 288
231
+ },
232
+ {
233
+ "epoch": 5.96078431372549,
234
+ "grad_norm": 1.7859069108963013,
235
+ "learning_rate": 2.5139223786820744e-07,
236
+ "loss": 9.3379,
237
+ "step": 304
238
+ },
239
+ {
240
+ "epoch": 6.0,
241
+ "eval_bleu": 0.27264795125070107,
242
+ "eval_cap_loss": 2.0194136815912582,
243
+ "eval_con_loss": 2.0591090099484313,
244
+ "eval_loss": 5.127924858355055,
245
+ "step": 306
246
+ },
247
+ {
248
+ "epoch": 6.0,
249
+ "eval_bleu": 0.27264795125070107,
250
+ "eval_cap_loss": 2.0194136815912582,
251
+ "eval_con_loss": 2.0591090099484313,
252
+ "eval_loss": 5.127924858355055,
253
+ "eval_runtime": 31.0894,
254
+ "eval_samples_per_second": 26.022,
255
+ "eval_steps_per_second": 3.281,
256
+ "step": 306
257
+ },
258
+ {
259
+ "epoch": 6.2745098039215685,
260
+ "grad_norm": 2.1876094341278076,
261
+ "learning_rate": 1.877172272530264e-07,
262
+ "loss": 9.2022,
263
+ "step": 320
264
+ },
265
+ {
266
+ "epoch": 6.588235294117647,
267
+ "grad_norm": 2.331590414047241,
268
+ "learning_rate": 1.3145176867832165e-07,
269
+ "loss": 9.3296,
270
+ "step": 336
271
+ },
272
+ {
273
+ "epoch": 6.901960784313726,
274
+ "grad_norm": 2.335200309753418,
275
+ "learning_rate": 8.393087604743283e-08,
276
+ "loss": 9.337,
277
+ "step": 352
278
+ },
279
+ {
280
+ "epoch": 7.0,
281
+ "eval_bleu": 0.272648714102529,
282
+ "eval_cap_loss": 2.0149140498217415,
283
+ "eval_con_loss": 2.059108528436399,
284
+ "eval_loss": 5.125674068343406,
285
+ "step": 357
286
+ },
287
+ {
288
+ "epoch": 7.0,
289
+ "eval_bleu": 0.272648714102529,
290
+ "eval_cap_loss": 2.0149140498217415,
291
+ "eval_con_loss": 2.059108528436399,
292
+ "eval_loss": 5.125674068343406,
293
+ "eval_runtime": 31.305,
294
+ "eval_samples_per_second": 25.843,
295
+ "eval_steps_per_second": 3.258,
296
+ "step": 357
297
+ },
298
+ {
299
+ "epoch": 7.215686274509804,
300
+ "grad_norm": 2.2809042930603027,
301
+ "learning_rate": 4.6282080424148886e-08,
302
+ "loss": 9.2029,
303
+ "step": 368
304
+ },
305
+ {
306
+ "epoch": 7.529411764705882,
307
+ "grad_norm": 1.9819755554199219,
308
+ "learning_rate": 1.9398677035671218e-08,
309
+ "loss": 9.3332,
310
+ "step": 384
311
+ },
312
+ {
313
+ "epoch": 7.8431372549019605,
314
+ "grad_norm": 2.1756012439727783,
315
+ "learning_rate": 3.9185300032889e-09,
316
+ "loss": 9.3308,
317
+ "step": 400
318
+ },
319
+ {
320
+ "epoch": 8.0,
321
+ "eval_bleu": 0.2724553369227519,
322
+ "eval_cap_loss": 2.01423061244628,
323
+ "eval_con_loss": 2.0591084513009763,
324
+ "eval_loss": 5.125332222265356,
325
+ "step": 408
326
+ },
327
+ {
328
+ "epoch": 8.0,
329
+ "eval_bleu": 0.2724553369227519,
330
+ "eval_cap_loss": 2.01423061244628,
331
+ "eval_con_loss": 2.0591084513009763,
332
+ "eval_loss": 5.125332222265356,
333
+ "eval_runtime": 31.0026,
334
+ "eval_samples_per_second": 26.095,
335
+ "eval_steps_per_second": 3.29,
336
+ "step": 408
337
+ }
338
+ ],
339
+ "logging_steps": 16,
340
+ "max_steps": 408,
341
+ "num_input_tokens_seen": 0,
342
+ "num_train_epochs": 8,
343
+ "save_steps": 500,
344
+ "stateful_callbacks": {
345
+ "TrainerControl": {
346
+ "args": {
347
+ "should_epoch_stop": false,
348
+ "should_evaluate": false,
349
+ "should_log": false,
350
+ "should_save": true,
351
+ "should_training_stop": true
352
+ },
353
+ "attributes": {}
354
+ }
355
+ },
356
+ "total_flos": 0.0,
357
+ "train_batch_size": 64,
358
+ "trial_name": null,
359
+ "trial_params": null
360
+ }