mgh6 committed on
Commit
655bfe6
·
verified ·
1 Parent(s): aa5ba71

Training in progress, epoch 0, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8e62f21c2524fbe6fa35e7771d7a0f174a91d2590b39bd6f6aeb7cdccfd0659c
3
  size 2610104820
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ede89cd7420b342554cf586111ef386bc4803fe1942c8c752e713c75eb639884
3
  size 2610104820
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6bccecaf037aa026fb3aeecf97033588a8b734ad77c6f3316f4d4d4665be6d75
3
  size 5210004271
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1a83ac9d504c4f5a9d0b3ec6c9d0ab931281b9695216ad64ae558fdd4e9634d9
3
  size 5210004271
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:70145026e21afc6ea2717a18ed89206163fc726fb3040617116c08c85b455de2
3
  size 15006
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9080aea5181d3066ab765d04bc9819f089e9674161d5e56c8bf2b7c839212160
3
  size 15006
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2da52fce13790b5d54928ad82a11cde2bbdaabd941b9375b0d9e259039c539e5
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3fd6739d9b468767c726a8685cd4457152a8323bb7a81cf6908a01dd282a18e8
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,342 +1,27 @@
1
  {
2
- "best_metric": 0.8495596647262573,
3
- "best_model_checkpoint": "mgh6/HTH_prob/checkpoint-4636",
4
- "epoch": 21.996045413955862,
5
  "eval_steps": 500,
6
- "global_step": 5368,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.9960454139558618,
13
- "grad_norm": 0.2985314428806305,
14
- "learning_rate": 9.833333333333333e-05,
15
- "loss": 1.2681,
16
  "step": 244
17
  },
18
  {
19
  "epoch": 0.9960454139558618,
20
- "eval_loss": 1.157210111618042,
21
- "eval_runtime": 25.4096,
22
- "eval_samples_per_second": 32.507,
23
- "eval_steps_per_second": 16.254,
24
  "step": 244
25
- },
26
- {
27
- "epoch": 1.9960454139558617,
28
- "grad_norm": 0.37684935331344604,
29
- "learning_rate": 9.666666666666667e-05,
30
- "loss": 1.0601,
31
- "step": 488
32
- },
33
- {
34
- "epoch": 1.9960454139558617,
35
- "eval_loss": 1.0078188180923462,
36
- "eval_runtime": 25.3488,
37
- "eval_samples_per_second": 32.585,
38
- "eval_steps_per_second": 16.293,
39
- "step": 488
40
- },
41
- {
42
- "epoch": 2.9960454139558617,
43
- "grad_norm": 0.22615128755569458,
44
- "learning_rate": 9.5e-05,
45
- "loss": 0.9323,
46
- "step": 732
47
- },
48
- {
49
- "epoch": 2.9960454139558617,
50
- "eval_loss": 0.9372721314430237,
51
- "eval_runtime": 25.365,
52
- "eval_samples_per_second": 32.565,
53
- "eval_steps_per_second": 16.282,
54
- "step": 732
55
- },
56
- {
57
- "epoch": 3.9960454139558617,
58
- "grad_norm": 0.15589652955532074,
59
- "learning_rate": 9.333333333333334e-05,
60
- "loss": 0.8778,
61
- "step": 976
62
- },
63
- {
64
- "epoch": 3.9960454139558617,
65
- "eval_loss": 0.9012424349784851,
66
- "eval_runtime": 25.3688,
67
- "eval_samples_per_second": 32.56,
68
- "eval_steps_per_second": 16.28,
69
- "step": 976
70
- },
71
- {
72
- "epoch": 4.996045413955862,
73
- "grad_norm": 0.11001910269260406,
74
- "learning_rate": 9.166666666666667e-05,
75
- "loss": 0.854,
76
- "step": 1220
77
- },
78
- {
79
- "epoch": 4.996045413955862,
80
- "eval_loss": 0.881626546382904,
81
- "eval_runtime": 25.3578,
82
- "eval_samples_per_second": 32.574,
83
- "eval_steps_per_second": 16.287,
84
- "step": 1220
85
- },
86
- {
87
- "epoch": 5.996045413955862,
88
- "grad_norm": 0.09984524548053741,
89
- "learning_rate": 9e-05,
90
- "loss": 0.8412,
91
- "step": 1464
92
- },
93
- {
94
- "epoch": 5.996045413955862,
95
- "eval_loss": 0.8756476044654846,
96
- "eval_runtime": 25.3658,
97
- "eval_samples_per_second": 32.564,
98
- "eval_steps_per_second": 16.282,
99
- "step": 1464
100
- },
101
- {
102
- "epoch": 6.996045413955862,
103
- "grad_norm": 0.08100161701440811,
104
- "learning_rate": 8.833333333333333e-05,
105
- "loss": 0.8331,
106
- "step": 1708
107
- },
108
- {
109
- "epoch": 6.996045413955862,
110
- "eval_loss": 0.8662193417549133,
111
- "eval_runtime": 25.367,
112
- "eval_samples_per_second": 32.562,
113
- "eval_steps_per_second": 16.281,
114
- "step": 1708
115
- },
116
- {
117
- "epoch": 7.996045413955862,
118
- "grad_norm": 0.07314834743738174,
119
- "learning_rate": 8.666666666666667e-05,
120
- "loss": 0.828,
121
- "step": 1952
122
- },
123
- {
124
- "epoch": 7.996045413955862,
125
- "eval_loss": 0.8645371198654175,
126
- "eval_runtime": 25.3405,
127
- "eval_samples_per_second": 32.596,
128
- "eval_steps_per_second": 16.298,
129
- "step": 1952
130
- },
131
- {
132
- "epoch": 8.996045413955862,
133
- "grad_norm": 0.06718147546052933,
134
- "learning_rate": 8.5e-05,
135
- "loss": 0.8245,
136
- "step": 2196
137
- },
138
- {
139
- "epoch": 8.996045413955862,
140
- "eval_loss": 0.8601691722869873,
141
- "eval_runtime": 25.3375,
142
- "eval_samples_per_second": 32.6,
143
- "eval_steps_per_second": 16.3,
144
- "step": 2196
145
- },
146
- {
147
- "epoch": 9.996045413955862,
148
- "grad_norm": 0.059008464217185974,
149
- "learning_rate": 8.333333333333334e-05,
150
- "loss": 0.822,
151
- "step": 2440
152
- },
153
- {
154
- "epoch": 9.996045413955862,
155
- "eval_loss": 0.8585366010665894,
156
- "eval_runtime": 25.3797,
157
- "eval_samples_per_second": 32.546,
158
- "eval_steps_per_second": 16.273,
159
- "step": 2440
160
- },
161
- {
162
- "epoch": 10.996045413955862,
163
- "grad_norm": 0.05849480628967285,
164
- "learning_rate": 8.166666666666667e-05,
165
- "loss": 0.82,
166
- "step": 2684
167
- },
168
- {
169
- "epoch": 10.996045413955862,
170
- "eval_loss": 0.8567091226577759,
171
- "eval_runtime": 25.3556,
172
- "eval_samples_per_second": 32.577,
173
- "eval_steps_per_second": 16.288,
174
- "step": 2684
175
- },
176
- {
177
- "epoch": 11.996045413955862,
178
- "grad_norm": 0.059530675411224365,
179
- "learning_rate": 8e-05,
180
- "loss": 0.8186,
181
- "step": 2928
182
- },
183
- {
184
- "epoch": 11.996045413955862,
185
- "eval_loss": 0.8564208745956421,
186
- "eval_runtime": 25.3424,
187
- "eval_samples_per_second": 32.594,
188
- "eval_steps_per_second": 16.297,
189
- "step": 2928
190
- },
191
- {
192
- "epoch": 12.996045413955862,
193
- "grad_norm": 0.06416182219982147,
194
- "learning_rate": 7.833333333333333e-05,
195
- "loss": 0.8173,
196
- "step": 3172
197
- },
198
- {
199
- "epoch": 12.996045413955862,
200
- "eval_loss": 0.8540862798690796,
201
- "eval_runtime": 25.3529,
202
- "eval_samples_per_second": 32.58,
203
- "eval_steps_per_second": 16.29,
204
- "step": 3172
205
- },
206
- {
207
- "epoch": 13.996045413955862,
208
- "grad_norm": 0.05353016406297684,
209
- "learning_rate": 7.666666666666667e-05,
210
- "loss": 0.8171,
211
- "step": 3416
212
- },
213
- {
214
- "epoch": 13.996045413955862,
215
- "eval_loss": 0.8535267114639282,
216
- "eval_runtime": 25.3125,
217
- "eval_samples_per_second": 32.632,
218
- "eval_steps_per_second": 16.316,
219
- "step": 3416
220
- },
221
- {
222
- "epoch": 14.996045413955862,
223
- "grad_norm": 0.04900681599974632,
224
- "learning_rate": 7.500000000000001e-05,
225
- "loss": 0.8151,
226
- "step": 3660
227
- },
228
- {
229
- "epoch": 14.996045413955862,
230
- "eval_loss": 0.8519299626350403,
231
- "eval_runtime": 25.3532,
232
- "eval_samples_per_second": 32.58,
233
- "eval_steps_per_second": 16.29,
234
- "step": 3660
235
- },
236
- {
237
- "epoch": 15.996045413955862,
238
- "grad_norm": 0.0492498017847538,
239
- "learning_rate": 7.333333333333333e-05,
240
- "loss": 0.8143,
241
- "step": 3904
242
- },
243
- {
244
- "epoch": 15.996045413955862,
245
- "eval_loss": 0.8498228788375854,
246
- "eval_runtime": 25.3097,
247
- "eval_samples_per_second": 32.636,
248
- "eval_steps_per_second": 16.318,
249
- "step": 3904
250
- },
251
- {
252
- "epoch": 16.996045413955862,
253
- "grad_norm": 0.04555810987949371,
254
- "learning_rate": 7.166666666666667e-05,
255
- "loss": 0.8136,
256
- "step": 4148
257
- },
258
- {
259
- "epoch": 16.996045413955862,
260
- "eval_loss": 0.8518642783164978,
261
- "eval_runtime": 25.326,
262
- "eval_samples_per_second": 32.615,
263
- "eval_steps_per_second": 16.307,
264
- "step": 4148
265
- },
266
- {
267
- "epoch": 17.996045413955862,
268
- "grad_norm": 0.03729957342147827,
269
- "learning_rate": 7e-05,
270
- "loss": 0.8134,
271
- "step": 4392
272
- },
273
- {
274
- "epoch": 17.996045413955862,
275
- "eval_loss": 0.8507369756698608,
276
- "eval_runtime": 25.3562,
277
- "eval_samples_per_second": 32.576,
278
- "eval_steps_per_second": 16.288,
279
- "step": 4392
280
- },
281
- {
282
- "epoch": 18.996045413955862,
283
- "grad_norm": 0.037404902279376984,
284
- "learning_rate": 6.833333333333333e-05,
285
- "loss": 0.8124,
286
- "step": 4636
287
- },
288
- {
289
- "epoch": 18.996045413955862,
290
- "eval_loss": 0.8495596647262573,
291
- "eval_runtime": 25.3504,
292
- "eval_samples_per_second": 32.583,
293
- "eval_steps_per_second": 16.292,
294
- "step": 4636
295
- },
296
- {
297
- "epoch": 19.996045413955862,
298
- "grad_norm": 0.045253317803144455,
299
- "learning_rate": 6.666666666666667e-05,
300
- "loss": 0.8117,
301
- "step": 4880
302
- },
303
- {
304
- "epoch": 19.996045413955862,
305
- "eval_loss": 0.8522358536720276,
306
- "eval_runtime": 25.356,
307
- "eval_samples_per_second": 32.576,
308
- "eval_steps_per_second": 16.288,
309
- "step": 4880
310
- },
311
- {
312
- "epoch": 20.996045413955862,
313
- "grad_norm": 0.04286725074052811,
314
- "learning_rate": 6.500000000000001e-05,
315
- "loss": 0.8133,
316
- "step": 5124
317
- },
318
- {
319
- "epoch": 20.996045413955862,
320
- "eval_loss": 0.8517967462539673,
321
- "eval_runtime": 25.3545,
322
- "eval_samples_per_second": 32.578,
323
- "eval_steps_per_second": 16.289,
324
- "step": 5124
325
- },
326
- {
327
- "epoch": 21.996045413955862,
328
- "grad_norm": 0.033235229551792145,
329
- "learning_rate": 6.333333333333333e-05,
330
- "loss": 0.8128,
331
- "step": 5368
332
- },
333
- {
334
- "epoch": 21.996045413955862,
335
- "eval_loss": 0.8513291478157043,
336
- "eval_runtime": 25.3035,
337
- "eval_samples_per_second": 32.644,
338
- "eval_steps_per_second": 16.322,
339
- "step": 5368
340
  }
341
  ],
342
  "logging_steps": 500,
@@ -351,7 +36,7 @@
351
  "early_stopping_threshold": 0.0
352
  },
353
  "attributes": {
354
- "early_stopping_patience_counter": 3
355
  }
356
  },
357
  "TrainerControl": {
 
1
  {
2
+ "best_metric": 1.3716533184051514,
3
+ "best_model_checkpoint": "mgh6/HTH_prob/checkpoint-244",
4
+ "epoch": 0.9960454139558618,
5
  "eval_steps": 500,
6
+ "global_step": 244,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.9960454139558618,
13
+ "grad_norm": 0.04875887930393219,
14
+ "learning_rate": 0.0009833333333333332,
15
+ "loss": 1.3837,
16
  "step": 244
17
  },
18
  {
19
  "epoch": 0.9960454139558618,
20
+ "eval_loss": 1.3716533184051514,
21
+ "eval_runtime": 25.1667,
22
+ "eval_samples_per_second": 32.821,
23
+ "eval_steps_per_second": 16.411,
24
  "step": 244
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  }
26
  ],
27
  "logging_steps": 500,
 
36
  "early_stopping_threshold": 0.0
37
  },
38
  "attributes": {
39
+ "early_stopping_patience_counter": 0
40
  }
41
  },
42
  "TrainerControl": {
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:06b76751df474367f2aad140f11a5cac938596d395fdaf77198812027cdca85a
3
  size 5368
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:667cf2698d9700cfae15eb710eb29a7fe4af2b3f185b9513c6453b73cf79787c
3
  size 5368