mrplants committed on
Commit
be82c52
·
verified ·
1 Parent(s): b52a107

Training in progress, epoch 2, checkpoint

Browse files
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:59f68e78e982909d4e878fb37ffc83266760fb6dae40dac940b403410ce90a97
3
  size 2536
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:714661aaef8e2af2d7ec7d69e1adb728bae869801f07397b14edb899af2ee9a2
3
  size 2536
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:95b6047bd8cc6f4cdf7c46dea47edb8e542435510070c6cd1e0a7d9ccf5fd7da
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:435c6f28df4d46a1bb36792295b64bf8fda402d0bd8eeee291d4535762bfc591
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4933c471fb1e4ba81de00146ddd721361901645c866fd1d76662b6837ae85d16
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:87f68093f64fb3790e94fd47e9bf45be6d11c09381e54b12e7194571e6bc1ba5
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 3.0,
5
  "eval_steps": 500,
6
- "global_step": 894,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -11,271 +11,179 @@
11
  {
12
  "epoch": 0.08389261744966443,
13
  "grad_norm": 0.0,
14
- "learning_rate": 1.851851851851852e-05,
15
  "loss": 4.5636,
16
  "step": 25
17
  },
18
  {
19
  "epoch": 0.16778523489932887,
20
  "grad_norm": 0.0,
21
- "learning_rate": 1.9965291500546865e-05,
22
  "loss": 4.5267,
23
  "step": 50
24
  },
25
  {
26
  "epoch": 0.2516778523489933,
27
  "grad_norm": 0.0,
28
- "learning_rate": 1.984912443051131e-05,
29
  "loss": 4.6574,
30
  "step": 75
31
  },
32
  {
33
  "epoch": 0.33557046979865773,
34
  "grad_norm": 0.0,
35
- "learning_rate": 1.965218883028299e-05,
36
  "loss": 4.589,
37
  "step": 100
38
  },
39
  {
40
  "epoch": 0.41946308724832215,
41
  "grad_norm": 0.0,
42
- "learning_rate": 1.9376099685953836e-05,
43
  "loss": 4.6256,
44
  "step": 125
45
  },
46
  {
47
  "epoch": 0.5033557046979866,
48
  "grad_norm": 0.0,
49
- "learning_rate": 1.9023121088565353e-05,
50
  "loss": 4.4971,
51
  "step": 150
52
  },
53
  {
54
  "epoch": 0.587248322147651,
55
  "grad_norm": 0.0,
56
- "learning_rate": 1.8596147667249457e-05,
57
  "loss": 4.7099,
58
  "step": 175
59
  },
60
  {
61
  "epoch": 0.6711409395973155,
62
  "grad_norm": 0.0,
63
- "learning_rate": 1.8098680851591538e-05,
64
  "loss": 4.619,
65
  "step": 200
66
  },
67
  {
68
  "epoch": 0.7550335570469798,
69
  "grad_norm": 0.0,
70
- "learning_rate": 1.753480015787792e-05,
71
  "loss": 4.6121,
72
  "step": 225
73
  },
74
  {
75
  "epoch": 0.8389261744966443,
76
  "grad_norm": 0.0,
77
- "learning_rate": 1.6909129734697306e-05,
78
  "loss": 4.4854,
79
  "step": 250
80
  },
81
  {
82
  "epoch": 0.9228187919463087,
83
  "grad_norm": 0.0,
84
- "learning_rate": 1.6226800442241582e-05,
85
  "loss": 4.6153,
86
  "step": 275
87
  },
88
  {
89
  "epoch": 1.0,
90
  "eval_loss": 4.619876861572266,
91
- "eval_runtime": 1.7691,
92
- "eval_samples_per_second": 71.223,
93
- "eval_steps_per_second": 9.044,
94
  "step": 298
95
  },
96
  {
97
  "epoch": 1.0067114093959733,
98
  "grad_norm": 0.0,
99
- "learning_rate": 1.54934077762777e-05,
100
  "loss": 4.6071,
101
  "step": 300
102
  },
103
  {
104
  "epoch": 1.0906040268456376,
105
  "grad_norm": 0.0,
106
- "learning_rate": 1.4714965981838503e-05,
107
  "loss": 4.5226,
108
  "step": 325
109
  },
110
  {
111
  "epoch": 1.174496644295302,
112
  "grad_norm": 0.0,
113
- "learning_rate": 1.3897858732926794e-05,
114
  "loss": 4.4466,
115
  "step": 350
116
  },
117
  {
118
  "epoch": 1.2583892617449663,
119
  "grad_norm": 0.0,
120
- "learning_rate": 1.3048786782687706e-05,
121
  "loss": 4.4967,
122
  "step": 375
123
  },
124
  {
125
  "epoch": 1.342281879194631,
126
  "grad_norm": 0.0,
127
- "learning_rate": 1.2174713013348227e-05,
128
  "loss": 4.7321,
129
  "step": 400
130
  },
131
  {
132
  "epoch": 1.4261744966442953,
133
  "grad_norm": 0.0,
134
- "learning_rate": 1.128280533654637e-05,
135
  "loss": 4.6957,
136
  "step": 425
137
  },
138
  {
139
  "epoch": 1.5100671140939599,
140
  "grad_norm": 0.0,
141
- "learning_rate": 1.0380377912300231e-05,
142
  "loss": 4.5298,
143
  "step": 450
144
  },
145
  {
146
  "epoch": 1.5939597315436242,
147
  "grad_norm": 0.0,
148
- "learning_rate": 9.474831168655596e-06,
149
  "loss": 4.359,
150
  "step": 475
151
  },
152
  {
153
  "epoch": 1.6778523489932886,
154
  "grad_norm": 0.0,
155
- "learning_rate": 8.573591113885695e-06,
156
  "loss": 4.5875,
157
  "step": 500
158
  },
159
  {
160
  "epoch": 1.761744966442953,
161
  "grad_norm": 0.0,
162
- "learning_rate": 7.684048438918247e-06,
163
  "loss": 4.6772,
164
  "step": 525
165
  },
166
  {
167
  "epoch": 1.8456375838926173,
168
  "grad_norm": 0.0,
169
- "learning_rate": 6.813497909385252e-06,
170
  "loss": 4.5338,
171
  "step": 550
172
  },
173
  {
174
  "epoch": 1.929530201342282,
175
  "grad_norm": 0.0,
176
- "learning_rate": 5.969078544315747e-06,
177
  "loss": 4.6363,
178
  "step": 575
179
  },
180
  {
181
  "epoch": 2.0,
182
  "eval_loss": 4.619876861572266,
183
- "eval_runtime": 2.1011,
184
- "eval_samples_per_second": 59.968,
185
- "eval_steps_per_second": 7.615,
186
  "step": 596
187
- },
188
- {
189
- "epoch": 2.0134228187919465,
190
- "grad_norm": 0.0,
191
- "learning_rate": 5.157715072041094e-06,
192
- "loss": 4.6935,
193
- "step": 600
194
- },
195
- {
196
- "epoch": 2.097315436241611,
197
- "grad_norm": 0.0,
198
- "learning_rate": 4.386061143408135e-06,
199
- "loss": 4.5611,
200
- "step": 625
201
- },
202
- {
203
- "epoch": 2.1812080536912752,
204
- "grad_norm": 0.0,
205
- "learning_rate": 3.660444767984911e-06,
206
- "loss": 4.4651,
207
- "step": 650
208
- },
209
- {
210
- "epoch": 2.2651006711409396,
211
- "grad_norm": 0.0,
212
- "learning_rate": 2.986816420713662e-06,
213
- "loss": 4.6059,
214
- "step": 675
215
- },
216
- {
217
- "epoch": 2.348993288590604,
218
- "grad_norm": 0.0,
219
- "learning_rate": 2.370700244566605e-06,
220
- "loss": 4.4503,
221
- "step": 700
222
- },
223
- {
224
- "epoch": 2.4328859060402683,
225
- "grad_norm": 0.0,
226
- "learning_rate": 1.8171487493710337e-06,
227
- "loss": 4.5825,
228
- "step": 725
229
- },
230
- {
231
- "epoch": 2.5167785234899327,
232
- "grad_norm": 0.0,
233
- "learning_rate": 1.3307013782996237e-06,
234
- "loss": 4.6254,
235
- "step": 750
236
- },
237
- {
238
- "epoch": 2.600671140939597,
239
- "grad_norm": 0.0,
240
- "learning_rate": 9.153472818047627e-07,
241
- "loss": 4.6237,
242
- "step": 775
243
- },
244
- {
245
- "epoch": 2.684563758389262,
246
- "grad_norm": 0.0,
247
- "learning_rate": 5.74492604272191e-07,
248
- "loss": 4.4167,
249
- "step": 800
250
- },
251
- {
252
- "epoch": 2.7684563758389262,
253
- "grad_norm": 0.0,
254
- "learning_rate": 3.109325516623818e-07,
255
- "loss": 4.6393,
256
- "step": 825
257
- },
258
- {
259
- "epoch": 2.8523489932885906,
260
- "grad_norm": 0.0,
261
- "learning_rate": 1.2682846920120228e-07,
262
- "loss": 4.6915,
263
- "step": 850
264
- },
265
- {
266
- "epoch": 2.936241610738255,
267
- "grad_norm": 0.0,
268
- "learning_rate": 2.369011709604463e-08,
269
- "loss": 4.7184,
270
- "step": 875
271
- },
272
- {
273
- "epoch": 3.0,
274
- "eval_loss": 4.619876861572266,
275
- "eval_runtime": 1.8875,
276
- "eval_samples_per_second": 66.756,
277
- "eval_steps_per_second": 8.477,
278
- "step": 894
279
  }
280
  ],
281
  "logging_steps": 25,
@@ -290,12 +198,12 @@
290
  "should_evaluate": false,
291
  "should_log": false,
292
  "should_save": true,
293
- "should_training_stop": true
294
  },
295
  "attributes": {}
296
  }
297
  },
298
- "total_flos": 3.4434735820916736e+16,
299
  "train_batch_size": 8,
300
  "trial_name": null,
301
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 2.0,
5
  "eval_steps": 500,
6
+ "global_step": 596,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
11
  {
12
  "epoch": 0.08389261744966443,
13
  "grad_norm": 0.0,
14
+ "learning_rate": 0.0001851851851851852,
15
  "loss": 4.5636,
16
  "step": 25
17
  },
18
  {
19
  "epoch": 0.16778523489932887,
20
  "grad_norm": 0.0,
21
+ "learning_rate": 0.00019965291500546864,
22
  "loss": 4.5267,
23
  "step": 50
24
  },
25
  {
26
  "epoch": 0.2516778523489933,
27
  "grad_norm": 0.0,
28
+ "learning_rate": 0.0001984912443051131,
29
  "loss": 4.6574,
30
  "step": 75
31
  },
32
  {
33
  "epoch": 0.33557046979865773,
34
  "grad_norm": 0.0,
35
+ "learning_rate": 0.0001965218883028299,
36
  "loss": 4.589,
37
  "step": 100
38
  },
39
  {
40
  "epoch": 0.41946308724832215,
41
  "grad_norm": 0.0,
42
+ "learning_rate": 0.00019376099685953837,
43
  "loss": 4.6256,
44
  "step": 125
45
  },
46
  {
47
  "epoch": 0.5033557046979866,
48
  "grad_norm": 0.0,
49
+ "learning_rate": 0.00019023121088565352,
50
  "loss": 4.4971,
51
  "step": 150
52
  },
53
  {
54
  "epoch": 0.587248322147651,
55
  "grad_norm": 0.0,
56
+ "learning_rate": 0.00018596147667249458,
57
  "loss": 4.7099,
58
  "step": 175
59
  },
60
  {
61
  "epoch": 0.6711409395973155,
62
  "grad_norm": 0.0,
63
+ "learning_rate": 0.00018098680851591536,
64
  "loss": 4.619,
65
  "step": 200
66
  },
67
  {
68
  "epoch": 0.7550335570469798,
69
  "grad_norm": 0.0,
70
+ "learning_rate": 0.00017534800157877918,
71
  "loss": 4.6121,
72
  "step": 225
73
  },
74
  {
75
  "epoch": 0.8389261744966443,
76
  "grad_norm": 0.0,
77
+ "learning_rate": 0.00016909129734697306,
78
  "loss": 4.4854,
79
  "step": 250
80
  },
81
  {
82
  "epoch": 0.9228187919463087,
83
  "grad_norm": 0.0,
84
+ "learning_rate": 0.0001622680044224158,
85
  "loss": 4.6153,
86
  "step": 275
87
  },
88
  {
89
  "epoch": 1.0,
90
  "eval_loss": 4.619876861572266,
91
+ "eval_runtime": 1.7786,
92
+ "eval_samples_per_second": 70.844,
93
+ "eval_steps_per_second": 8.996,
94
  "step": 298
95
  },
96
  {
97
  "epoch": 1.0067114093959733,
98
  "grad_norm": 0.0,
99
+ "learning_rate": 0.00015493407776277698,
100
  "loss": 4.6071,
101
  "step": 300
102
  },
103
  {
104
  "epoch": 1.0906040268456376,
105
  "grad_norm": 0.0,
106
+ "learning_rate": 0.00014714965981838503,
107
  "loss": 4.5226,
108
  "step": 325
109
  },
110
  {
111
  "epoch": 1.174496644295302,
112
  "grad_norm": 0.0,
113
+ "learning_rate": 0.00013897858732926793,
114
  "loss": 4.4466,
115
  "step": 350
116
  },
117
  {
118
  "epoch": 1.2583892617449663,
119
  "grad_norm": 0.0,
120
+ "learning_rate": 0.00013048786782687705,
121
  "loss": 4.4967,
122
  "step": 375
123
  },
124
  {
125
  "epoch": 1.342281879194631,
126
  "grad_norm": 0.0,
127
+ "learning_rate": 0.00012174713013348226,
128
  "loss": 4.7321,
129
  "step": 400
130
  },
131
  {
132
  "epoch": 1.4261744966442953,
133
  "grad_norm": 0.0,
134
+ "learning_rate": 0.0001128280533654637,
135
  "loss": 4.6957,
136
  "step": 425
137
  },
138
  {
139
  "epoch": 1.5100671140939599,
140
  "grad_norm": 0.0,
141
+ "learning_rate": 0.0001038037791230023,
142
  "loss": 4.5298,
143
  "step": 450
144
  },
145
  {
146
  "epoch": 1.5939597315436242,
147
  "grad_norm": 0.0,
148
+ "learning_rate": 9.474831168655595e-05,
149
  "loss": 4.359,
150
  "step": 475
151
  },
152
  {
153
  "epoch": 1.6778523489932886,
154
  "grad_norm": 0.0,
155
+ "learning_rate": 8.573591113885694e-05,
156
  "loss": 4.5875,
157
  "step": 500
158
  },
159
  {
160
  "epoch": 1.761744966442953,
161
  "grad_norm": 0.0,
162
+ "learning_rate": 7.684048438918248e-05,
163
  "loss": 4.6772,
164
  "step": 525
165
  },
166
  {
167
  "epoch": 1.8456375838926173,
168
  "grad_norm": 0.0,
169
+ "learning_rate": 6.813497909385251e-05,
170
  "loss": 4.5338,
171
  "step": 550
172
  },
173
  {
174
  "epoch": 1.929530201342282,
175
  "grad_norm": 0.0,
176
+ "learning_rate": 5.9690785443157474e-05,
177
  "loss": 4.6363,
178
  "step": 575
179
  },
180
  {
181
  "epoch": 2.0,
182
  "eval_loss": 4.619876861572266,
183
+ "eval_runtime": 1.8399,
184
+ "eval_samples_per_second": 68.481,
185
+ "eval_steps_per_second": 8.696,
186
  "step": 596
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
187
  }
188
  ],
189
  "logging_steps": 25,
 
198
  "should_evaluate": false,
199
  "should_log": false,
200
  "should_save": true,
201
+ "should_training_stop": false
202
  },
203
  "attributes": {}
204
  }
205
  },
206
+ "total_flos": 2.298711528064819e+16,
207
  "train_batch_size": 8,
208
  "trial_name": null,
209
  "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b6697f2864903978d31dd6ca1fe39ca44ced565a75f65d198b42ce2ce420093a
3
  size 5368
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:857c3d3b90faeb2a01802e5fabcb10a4e6cd4564a0b43dbff661f690802b0ddb
3
  size 5368