juselara committed on
Commit
e0641d7
·
1 Parent(s): 4737d5b

Trained model with more epochs.

Browse files
Files changed (6) hide show
  1. optimizer.pt +1 -1
  2. pytorch_model.bin +1 -1
  3. rng_state.pth +1 -1
  4. scheduler.pt +1 -1
  5. trainer_state.json +149 -205
  6. training_args.bin +1 -1
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7821d0cbfa96cb9e3dc1a9d11c3798188549e0b60e98b2bf452a78a21fe9a701
3
  size 995605189
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d8130dcd44bc26bc493210590186970609afda402c8a8c30d783b6c38ce3b369
3
  size 995605189
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:87039797eeee31fa3a1d10c2d523177194c55dc59f61f4e681b323a3ff27e3ac
3
  size 497807197
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e338a12b8bebe4e2678f3e2df6bb4349cf4c19e37d8577fff4193457b40c2a63
3
  size 497807197
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:46ce6d1e3d7df661bdec831f2ccc0a92148db2766e8bae2d96b45ecc70cfa48d
3
  size 14575
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3e8b05db3db8612d55f664401de73515ed39d7840e12323f51fcce11ead5d8ac
3
  size 14575
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8015d3b05a1d6e704db23f7bbf89c9b1f626d36f6bdc05c31adde13e38374441
3
  size 627
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3da2be91dd0fb10ee5c14ceb065a3e14a48dc06697808fe40026425eebb46709
3
  size 627
trainer_state.json CHANGED
@@ -1,324 +1,268 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 4.839419269687637,
5
- "global_step": 11000,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
9
  "log_history": [
10
  {
11
- "epoch": 0.22,
12
- "learning_rate": 2.5e-05,
13
- "loss": 7.2786,
14
  "step": 500
15
  },
16
  {
17
- "epoch": 0.22,
18
- "eval_loss": 4.886257171630859,
19
- "eval_runtime": 81.5985,
20
- "eval_samples_per_second": 190.972,
21
- "eval_steps_per_second": 11.936,
22
  "step": 500
23
  },
24
  {
25
- "epoch": 0.44,
26
- "learning_rate": 5e-05,
27
- "loss": 4.3573,
28
  "step": 1000
29
  },
30
  {
31
- "epoch": 0.44,
32
- "eval_loss": 3.8646771907806396,
33
- "eval_runtime": 81.3815,
34
- "eval_samples_per_second": 191.481,
35
- "eval_steps_per_second": 11.968,
36
  "step": 1000
37
  },
38
  {
39
- "epoch": 0.66,
40
- "learning_rate": 4.758803666184274e-05,
41
- "loss": 3.6885,
42
  "step": 1500
43
  },
44
  {
45
- "epoch": 0.66,
46
- "eval_loss": 3.40698504447937,
47
- "eval_runtime": 81.4183,
48
- "eval_samples_per_second": 191.394,
49
- "eval_steps_per_second": 11.963,
50
  "step": 1500
51
  },
52
  {
53
- "epoch": 0.88,
54
- "learning_rate": 4.517607332368548e-05,
55
- "loss": 3.3144,
56
  "step": 2000
57
  },
58
  {
59
- "epoch": 0.88,
60
- "eval_loss": 3.0980827808380127,
61
- "eval_runtime": 81.5071,
62
- "eval_samples_per_second": 191.186,
63
- "eval_steps_per_second": 11.95,
64
  "step": 2000
65
  },
66
  {
67
- "epoch": 1.1,
68
- "learning_rate": 4.2764109985528225e-05,
69
- "loss": 3.0218,
70
  "step": 2500
71
  },
72
  {
73
- "epoch": 1.1,
74
- "eval_loss": 2.8812012672424316,
75
- "eval_runtime": 81.6765,
76
- "eval_samples_per_second": 190.789,
77
- "eval_steps_per_second": 11.925,
78
  "step": 2500
79
  },
80
  {
81
- "epoch": 1.32,
82
- "learning_rate": 4.0352146647370966e-05,
83
- "loss": 2.832,
84
  "step": 3000
85
  },
86
  {
87
- "epoch": 1.32,
88
- "eval_loss": 2.730829954147339,
89
- "eval_runtime": 81.679,
90
- "eval_samples_per_second": 190.783,
91
- "eval_steps_per_second": 11.925,
92
  "step": 3000
93
  },
94
  {
95
- "epoch": 1.54,
96
- "learning_rate": 3.79401833092137e-05,
97
- "loss": 2.7159,
98
  "step": 3500
99
  },
100
  {
101
- "epoch": 1.54,
102
- "eval_loss": 2.6153135299682617,
103
- "eval_runtime": 81.5642,
104
- "eval_samples_per_second": 191.052,
105
- "eval_steps_per_second": 11.942,
106
  "step": 3500
107
  },
108
  {
109
- "epoch": 1.76,
110
- "learning_rate": 3.552821997105644e-05,
111
- "loss": 2.6086,
112
  "step": 4000
113
  },
114
  {
115
- "epoch": 1.76,
116
- "eval_loss": 2.522780418395996,
117
- "eval_runtime": 81.5345,
118
- "eval_samples_per_second": 191.122,
119
- "eval_steps_per_second": 11.946,
120
  "step": 4000
121
  },
122
  {
123
- "epoch": 1.98,
124
- "learning_rate": 3.311625663289918e-05,
125
- "loss": 2.52,
126
  "step": 4500
127
  },
128
  {
129
- "epoch": 1.98,
130
- "eval_loss": 2.446049451828003,
131
- "eval_runtime": 81.2437,
132
- "eval_samples_per_second": 191.806,
133
- "eval_steps_per_second": 11.989,
134
  "step": 4500
135
  },
136
  {
137
- "epoch": 2.2,
138
- "learning_rate": 3.0704293294741923e-05,
139
- "loss": 2.3734,
140
  "step": 5000
141
  },
142
  {
143
- "epoch": 2.2,
144
- "eval_loss": 2.3772902488708496,
145
- "eval_runtime": 81.4154,
146
- "eval_samples_per_second": 191.401,
147
- "eval_steps_per_second": 11.963,
148
  "step": 5000
149
  },
150
  {
151
- "epoch": 2.42,
152
- "learning_rate": 2.829232995658466e-05,
153
- "loss": 2.3257,
154
  "step": 5500
155
  },
156
  {
157
- "epoch": 2.42,
158
- "eval_loss": 2.322295904159546,
159
- "eval_runtime": 81.2639,
160
- "eval_samples_per_second": 191.758,
161
- "eval_steps_per_second": 11.986,
162
  "step": 5500
163
  },
164
  {
165
- "epoch": 2.64,
166
- "learning_rate": 2.58803666184274e-05,
167
- "loss": 2.2788,
168
  "step": 6000
169
  },
170
  {
171
- "epoch": 2.64,
172
- "eval_loss": 2.274811267852783,
173
- "eval_runtime": 81.4351,
174
- "eval_samples_per_second": 191.355,
175
- "eval_steps_per_second": 11.96,
176
  "step": 6000
177
  },
178
  {
179
- "epoch": 2.86,
180
- "learning_rate": 2.346840328027014e-05,
181
- "loss": 2.2449,
182
  "step": 6500
183
  },
184
  {
185
- "epoch": 2.86,
186
- "eval_loss": 2.233612298965454,
187
- "eval_runtime": 81.4361,
188
- "eval_samples_per_second": 191.352,
189
- "eval_steps_per_second": 11.96,
190
  "step": 6500
191
  },
192
  {
193
- "epoch": 3.08,
194
- "learning_rate": 2.105643994211288e-05,
195
- "loss": 2.1734,
196
  "step": 7000
197
  },
198
  {
199
- "epoch": 3.08,
200
- "eval_loss": 2.195352077484131,
201
- "eval_runtime": 81.4409,
202
- "eval_samples_per_second": 191.341,
203
- "eval_steps_per_second": 11.96,
204
  "step": 7000
205
  },
206
  {
207
- "epoch": 3.3,
208
- "learning_rate": 1.864447660395562e-05,
209
- "loss": 2.088,
210
  "step": 7500
211
  },
212
  {
213
- "epoch": 3.3,
214
- "eval_loss": 2.172189950942993,
215
- "eval_runtime": 81.2625,
216
- "eval_samples_per_second": 191.761,
217
- "eval_steps_per_second": 11.986,
218
  "step": 7500
219
  },
220
  {
221
- "epoch": 3.52,
222
- "learning_rate": 1.623251326579836e-05,
223
- "loss": 2.0633,
224
  "step": 8000
225
  },
226
  {
227
- "epoch": 3.52,
228
- "eval_loss": 2.1411869525909424,
229
- "eval_runtime": 81.2032,
230
- "eval_samples_per_second": 191.901,
231
- "eval_steps_per_second": 11.995,
232
  "step": 8000
233
  },
234
  {
235
- "epoch": 3.74,
236
- "learning_rate": 1.3825373854317415e-05,
237
- "loss": 2.0491,
238
  "step": 8500
239
  },
240
  {
241
- "epoch": 3.74,
242
- "eval_loss": 2.117720127105713,
243
- "eval_runtime": 81.4728,
244
- "eval_samples_per_second": 191.266,
245
- "eval_steps_per_second": 11.955,
246
  "step": 8500
247
  },
248
  {
249
- "epoch": 3.96,
250
- "learning_rate": 1.1413410516160154e-05,
251
- "loss": 2.032,
252
  "step": 9000
253
  },
254
  {
255
- "epoch": 3.96,
256
- "eval_loss": 2.0970561504364014,
257
- "eval_runtime": 81.4071,
258
- "eval_samples_per_second": 191.421,
259
- "eval_steps_per_second": 11.965,
260
  "step": 9000
261
- },
262
- {
263
- "epoch": 4.18,
264
- "learning_rate": 9.001447178002895e-06,
265
- "loss": 1.9451,
266
- "step": 9500
267
- },
268
- {
269
- "epoch": 4.18,
270
- "eval_loss": 2.084089756011963,
271
- "eval_runtime": 81.4657,
272
- "eval_samples_per_second": 191.283,
273
- "eval_steps_per_second": 11.956,
274
- "step": 9500
275
- },
276
- {
277
- "epoch": 4.4,
278
- "learning_rate": 6.589483839845635e-06,
279
- "loss": 1.9275,
280
- "step": 10000
281
- },
282
- {
283
- "epoch": 4.4,
284
- "eval_loss": 2.0687994956970215,
285
- "eval_runtime": 81.6182,
286
- "eval_samples_per_second": 190.926,
287
- "eval_steps_per_second": 11.934,
288
- "step": 10000
289
- },
290
- {
291
- "epoch": 4.62,
292
- "learning_rate": 4.18234442836469e-06,
293
- "loss": 1.9138,
294
- "step": 10500
295
- },
296
- {
297
- "epoch": 4.62,
298
- "eval_loss": 2.0576138496398926,
299
- "eval_runtime": 81.6062,
300
- "eval_samples_per_second": 190.954,
301
- "eval_steps_per_second": 11.935,
302
- "step": 10500
303
- },
304
- {
305
- "epoch": 4.84,
306
- "learning_rate": 1.7703810902074289e-06,
307
- "loss": 1.9068,
308
- "step": 11000
309
- },
310
- {
311
- "epoch": 4.84,
312
- "eval_loss": 2.048520565032959,
313
- "eval_runtime": 81.6161,
314
- "eval_samples_per_second": 190.93,
315
- "eval_steps_per_second": 11.934,
316
- "step": 11000
317
  }
318
  ],
319
- "max_steps": 11365,
320
- "num_train_epochs": 5,
321
- "total_flos": 1.1494473283584e+16,
322
  "trial_name": null,
323
  "trial_params": null
324
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 7.915567282321899,
5
+ "global_step": 9000,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
9
  "log_history": [
10
  {
11
+ "epoch": 0.44,
12
+ "learning_rate": 5e-05,
13
+ "loss": 6.3746,
14
  "step": 500
15
  },
16
  {
17
+ "epoch": 0.44,
18
+ "eval_loss": 4.111661911010742,
19
+ "eval_runtime": 79.373,
20
+ "eval_samples_per_second": 196.326,
21
+ "eval_steps_per_second": 6.136,
22
  "step": 500
23
  },
24
  {
25
+ "epoch": 0.88,
26
+ "learning_rate": 0.0001,
27
+ "loss": 3.696,
28
  "step": 1000
29
  },
30
  {
31
+ "epoch": 0.88,
32
+ "eval_loss": 3.2242989540100098,
33
+ "eval_runtime": 79.3675,
34
+ "eval_samples_per_second": 196.34,
35
+ "eval_steps_per_second": 6.136,
36
  "step": 1000
37
  },
38
  {
39
+ "epoch": 1.32,
40
+ "learning_rate": 9.382411067193676e-05,
41
+ "loss": 2.9752,
42
  "step": 1500
43
  },
44
  {
45
+ "epoch": 1.32,
46
+ "eval_loss": 2.7193033695220947,
47
+ "eval_runtime": 79.2691,
48
+ "eval_samples_per_second": 196.584,
49
+ "eval_steps_per_second": 6.144,
50
  "step": 1500
51
  },
52
  {
53
+ "epoch": 1.76,
54
+ "learning_rate": 8.764822134387353e-05,
55
+ "loss": 2.6145,
56
  "step": 2000
57
  },
58
  {
59
+ "epoch": 1.76,
60
+ "eval_loss": 2.45035457611084,
61
+ "eval_runtime": 79.2794,
62
+ "eval_samples_per_second": 196.558,
63
+ "eval_steps_per_second": 6.143,
64
  "step": 2000
65
  },
66
  {
67
+ "epoch": 2.2,
68
+ "learning_rate": 8.147233201581028e-05,
69
+ "loss": 2.3557,
70
  "step": 2500
71
  },
72
  {
73
+ "epoch": 2.2,
74
+ "eval_loss": 2.2841453552246094,
75
+ "eval_runtime": 79.2481,
76
+ "eval_samples_per_second": 196.636,
77
+ "eval_steps_per_second": 6.145,
78
  "step": 2500
79
  },
80
  {
81
+ "epoch": 2.64,
82
+ "learning_rate": 7.529644268774703e-05,
83
+ "loss": 2.2026,
84
  "step": 3000
85
  },
86
  {
87
+ "epoch": 2.64,
88
+ "eval_loss": 2.173915386199951,
89
+ "eval_runtime": 79.3117,
90
+ "eval_samples_per_second": 196.478,
91
+ "eval_steps_per_second": 6.14,
92
  "step": 3000
93
  },
94
  {
95
+ "epoch": 3.08,
96
+ "learning_rate": 6.912055335968379e-05,
97
+ "loss": 2.0825,
98
  "step": 3500
99
  },
100
  {
101
+ "epoch": 3.08,
102
+ "eval_loss": 2.0795915126800537,
103
+ "eval_runtime": 79.1878,
104
+ "eval_samples_per_second": 196.785,
105
+ "eval_steps_per_second": 6.15,
106
  "step": 3500
107
  },
108
  {
109
+ "epoch": 3.52,
110
+ "learning_rate": 6.294466403162056e-05,
111
+ "loss": 1.9298,
112
  "step": 4000
113
  },
114
  {
115
+ "epoch": 3.52,
116
+ "eval_loss": 2.0210442543029785,
117
+ "eval_runtime": 79.2843,
118
+ "eval_samples_per_second": 196.546,
119
+ "eval_steps_per_second": 6.142,
120
  "step": 4000
121
  },
122
  {
123
+ "epoch": 3.96,
124
+ "learning_rate": 5.6768774703557316e-05,
125
+ "loss": 1.8936,
126
  "step": 4500
127
  },
128
  {
129
+ "epoch": 3.96,
130
+ "eval_loss": 1.966531753540039,
131
+ "eval_runtime": 79.2784,
132
+ "eval_samples_per_second": 196.561,
133
+ "eval_steps_per_second": 6.143,
134
  "step": 4500
135
  },
136
  {
137
+ "epoch": 4.4,
138
+ "learning_rate": 5.059288537549407e-05,
139
+ "loss": 1.746,
140
  "step": 5000
141
  },
142
  {
143
+ "epoch": 4.4,
144
+ "eval_loss": 1.9314507246017456,
145
+ "eval_runtime": 79.2451,
146
+ "eval_samples_per_second": 196.643,
147
+ "eval_steps_per_second": 6.145,
148
  "step": 5000
149
  },
150
  {
151
+ "epoch": 4.84,
152
+ "learning_rate": 4.441699604743083e-05,
153
+ "loss": 1.717,
154
  "step": 5500
155
  },
156
  {
157
+ "epoch": 4.84,
158
+ "eval_loss": 1.8987737894058228,
159
+ "eval_runtime": 79.2498,
160
+ "eval_samples_per_second": 196.631,
161
+ "eval_steps_per_second": 6.145,
162
  "step": 5500
163
  },
164
  {
165
+ "epoch": 5.28,
166
+ "learning_rate": 3.824110671936759e-05,
167
+ "loss": 1.6156,
168
  "step": 6000
169
  },
170
  {
171
+ "epoch": 5.28,
172
+ "eval_loss": 1.8819612264633179,
173
+ "eval_runtime": 79.3489,
174
+ "eval_samples_per_second": 196.386,
175
+ "eval_steps_per_second": 6.137,
176
  "step": 6000
177
  },
178
  {
179
+ "epoch": 5.72,
180
+ "learning_rate": 3.2065217391304345e-05,
181
+ "loss": 1.5674,
182
  "step": 6500
183
  },
184
  {
185
+ "epoch": 5.72,
186
+ "eval_loss": 1.8578243255615234,
187
+ "eval_runtime": 79.2896,
188
+ "eval_samples_per_second": 196.533,
189
+ "eval_steps_per_second": 6.142,
190
  "step": 6500
191
  },
192
  {
193
+ "epoch": 6.16,
194
+ "learning_rate": 2.588932806324111e-05,
195
+ "loss": 1.5229,
196
  "step": 7000
197
  },
198
  {
199
+ "epoch": 6.16,
200
+ "eval_loss": 1.848434567451477,
201
+ "eval_runtime": 79.2658,
202
+ "eval_samples_per_second": 196.592,
203
+ "eval_steps_per_second": 6.144,
204
  "step": 7000
205
  },
206
  {
207
+ "epoch": 6.6,
208
+ "learning_rate": 1.9713438735177866e-05,
209
+ "loss": 1.4411,
210
  "step": 7500
211
  },
212
  {
213
+ "epoch": 6.6,
214
+ "eval_loss": 1.8373626470565796,
215
+ "eval_runtime": 79.267,
216
+ "eval_samples_per_second": 196.589,
217
+ "eval_steps_per_second": 6.144,
218
  "step": 7500
219
  },
220
  {
221
+ "epoch": 7.04,
222
+ "learning_rate": 1.3537549407114625e-05,
223
+ "loss": 1.4288,
224
  "step": 8000
225
  },
226
  {
227
+ "epoch": 7.04,
228
+ "eval_loss": 1.8292683362960815,
229
+ "eval_runtime": 79.3154,
230
+ "eval_samples_per_second": 196.469,
231
+ "eval_steps_per_second": 6.14,
232
  "step": 8000
233
  },
234
  {
235
+ "epoch": 7.48,
236
+ "learning_rate": 7.361660079051384e-06,
237
+ "loss": 1.3433,
238
  "step": 8500
239
  },
240
  {
241
+ "epoch": 7.48,
242
+ "eval_loss": 1.8282458782196045,
243
+ "eval_runtime": 79.2978,
244
+ "eval_samples_per_second": 196.512,
245
+ "eval_steps_per_second": 6.141,
246
  "step": 8500
247
  },
248
  {
249
+ "epoch": 7.92,
250
+ "learning_rate": 1.1857707509881422e-06,
251
+ "loss": 1.3415,
252
  "step": 9000
253
  },
254
  {
255
+ "epoch": 7.92,
256
+ "eval_loss": 1.8225059509277344,
257
+ "eval_runtime": 79.2921,
258
+ "eval_samples_per_second": 196.527,
259
+ "eval_steps_per_second": 6.142,
260
  "step": 9000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
261
  }
262
  ],
263
+ "max_steps": 9096,
264
+ "num_train_epochs": 8,
265
+ "total_flos": 1.88015947776e+16,
266
  "trial_name": null,
267
  "trial_params": null
268
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ff4a22b53672fbc7f2d5cc3310f4e17058a7a012b48b927465d691e9f95a218b
3
  size 3963
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9a43384642115ae1e433db04ff31d33786becdd7d0ea11e8c69f317f3f149bbd
3
  size 3963