IDS75912 committed on
Commit
cd6a72d
·
verified ·
1 Parent(s): cd3aae2

End of training

Browse files
README.md CHANGED
@@ -1,11 +1,7 @@
1
  ---
2
  library_name: transformers
3
- license: cc-by-nc-4.0
4
- base_model: MCG-NJU/videomae-base
5
  tags:
6
  - generated_from_trainer
7
- metrics:
8
- - accuracy
9
  model-index:
10
  - name: videomae-base-finetuned-dogBehavior
11
  results: []
@@ -16,10 +12,7 @@ should probably proofread and complete it, then remove this comment. -->
16
 
17
  # videomae-base-finetuned-dogBehavior
18
 
19
- This model is a fine-tuned version of [MCG-NJU/videomae-base](https://huggingface.co/MCG-NJU/videomae-base) on an unknown dataset.
20
- It achieves the following results on the evaluation set:
21
- - Loss: 0.7947
22
- - Accuracy: 0.8387
23
 
24
  ## Model description
25
 
@@ -42,45 +35,14 @@ The following hyperparameters were used during training:
42
  - train_batch_size: 8
43
  - eval_batch_size: 8
44
  - seed: 42
45
- - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
46
  - lr_scheduler_type: linear
47
  - lr_scheduler_warmup_ratio: 0.1
48
- - training_steps: 750
49
-
50
- ### Training results
51
-
52
- | Training Loss | Epoch | Step | Validation Loss | Accuracy |
53
- |:-------------:|:-------:|:----:|:---------------:|:--------:|
54
- | 1.5395 | 0.0413 | 31 | 1.5124 | 0.3226 |
55
- | 1.4347 | 1.0413 | 62 | 1.4659 | 0.3226 |
56
- | 1.3176 | 2.0413 | 93 | 1.4264 | 0.4839 |
57
- | 1.2765 | 3.0413 | 124 | 0.9989 | 0.7097 |
58
- | 1.1538 | 4.0413 | 155 | 1.0743 | 0.6129 |
59
- | 1.0894 | 5.0413 | 186 | 1.0309 | 0.6452 |
60
- | 0.705 | 6.0413 | 217 | 0.8872 | 0.7097 |
61
- | 0.8064 | 7.0413 | 248 | 1.0081 | 0.7097 |
62
- | 0.6057 | 8.0413 | 279 | 0.8231 | 0.7419 |
63
- | 0.493 | 9.0413 | 310 | 0.6791 | 0.7742 |
64
- | 0.6431 | 10.0413 | 341 | 0.7946 | 0.6129 |
65
- | 0.397 | 11.0413 | 372 | 0.8140 | 0.7742 |
66
- | 0.405 | 12.0413 | 403 | 0.8367 | 0.7742 |
67
- | 0.5463 | 13.0413 | 434 | 0.6394 | 0.7419 |
68
- | 0.3068 | 14.0413 | 465 | 0.8780 | 0.7419 |
69
- | 0.1043 | 15.0413 | 496 | 0.7384 | 0.7419 |
70
- | 0.3512 | 16.0413 | 527 | 0.9134 | 0.7419 |
71
- | 0.1996 | 17.0413 | 558 | 0.9459 | 0.7742 |
72
- | 0.2886 | 18.0413 | 589 | 0.7947 | 0.8387 |
73
- | 0.257 | 19.0413 | 620 | 0.9584 | 0.8065 |
74
- | 0.3948 | 20.0413 | 651 | 0.9726 | 0.7742 |
75
- | 0.1969 | 21.0413 | 682 | 0.8744 | 0.8065 |
76
- | 0.1238 | 22.0413 | 713 | 0.8290 | 0.8065 |
77
- | 0.1539 | 23.0413 | 744 | 0.8723 | 0.8065 |
78
- | 0.2076 | 24.008 | 750 | 0.8719 | 0.8065 |
79
-
80
 
81
  ### Framework versions
82
 
83
- - Transformers 4.44.1
84
- - Pytorch 2.5.1
85
- - Datasets 3.3.2
86
- - Tokenizers 0.19.1
 
1
  ---
2
  library_name: transformers
 
 
3
  tags:
4
  - generated_from_trainer
 
 
5
  model-index:
6
  - name: videomae-base-finetuned-dogBehavior
7
  results: []
 
12
 
13
  # videomae-base-finetuned-dogBehavior
14
 
15
+ This model was trained from scratch on an unknown dataset.
 
 
 
16
 
17
  ## Model description
18
 
 
35
  - train_batch_size: 8
36
  - eval_batch_size: 8
37
  - seed: 42
38
+ - optimizer: Use adamw_torch_fused with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
39
  - lr_scheduler_type: linear
40
  - lr_scheduler_warmup_ratio: 0.1
41
+ - training_steps: 125
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
 
43
  ### Framework versions
44
 
45
+ - Transformers 4.57.3
46
+ - Pytorch 2.9.1+cu130
47
+ - Datasets 4.4.2
48
+ - Tokenizers 0.22.2
all_results.json CHANGED
@@ -1,9 +1,8 @@
1
  {
2
- "epoch": 24.008,
3
- "eval_accuracy": 0.8387096774193549,
4
- "eval_loss": 0.7947272062301636,
5
- "eval_model_preparation_time": 0.0029,
6
- "eval_runtime": 53.1896,
7
- "eval_samples_per_second": 0.583,
8
- "eval_steps_per_second": 0.075
9
  }
 
1
  {
2
+ "eval_accuracy": 0.9393939393939394,
3
+ "eval_loss": 0.1900058090686798,
4
+ "eval_model_preparation_time": 0.0031,
5
+ "eval_runtime": 58.6079,
6
+ "eval_samples_per_second": 0.563,
7
+ "eval_steps_per_second": 0.085
 
8
  }
config.json CHANGED
@@ -1,5 +1,4 @@
1
  {
2
- "_name_or_path": "MCG-NJU/videomae-base",
3
  "architectures": [
4
  "VideoMAEForVideoClassification"
5
  ],
@@ -8,25 +7,20 @@
8
  "decoder_intermediate_size": 1536,
9
  "decoder_num_attention_heads": 6,
10
  "decoder_num_hidden_layers": 4,
 
11
  "hidden_act": "gelu",
12
  "hidden_dropout_prob": 0.0,
13
  "hidden_size": 768,
14
  "id2label": {
15
- "0": "justsniff04_10",
16
- "1": "lying04_10",
17
- "2": "scratch04_10",
18
- "3": "specialsniff13_10",
19
- "4": "turn04_10"
20
  },
21
  "image_size": 224,
22
  "initializer_range": 0.02,
23
  "intermediate_size": 3072,
24
  "label2id": {
25
- "justsniff04_10": 0,
26
- "lying04_10": 1,
27
- "scratch04_10": 2,
28
- "specialsniff13_10": 3,
29
- "turn04_10": 4
30
  },
31
  "layer_norm_eps": 1e-12,
32
  "model_type": "videomae",
@@ -38,8 +32,7 @@
38
  "patch_size": 16,
39
  "problem_type": "single_label_classification",
40
  "qkv_bias": true,
41
- "torch_dtype": "float32",
42
- "transformers_version": "4.44.1",
43
  "tubelet_size": 2,
44
  "use_mean_pooling": false
45
  }
 
1
  {
 
2
  "architectures": [
3
  "VideoMAEForVideoClassification"
4
  ],
 
7
  "decoder_intermediate_size": 1536,
8
  "decoder_num_attention_heads": 6,
9
  "decoder_num_hidden_layers": 4,
10
+ "dtype": "float32",
11
  "hidden_act": "gelu",
12
  "hidden_dropout_prob": 0.0,
13
  "hidden_size": 768,
14
  "id2label": {
15
+ "0": "justsniff",
16
+ "1": "lying"
 
 
 
17
  },
18
  "image_size": 224,
19
  "initializer_range": 0.02,
20
  "intermediate_size": 3072,
21
  "label2id": {
22
+ "justsniff": 0,
23
+ "lying": 1
 
 
 
24
  },
25
  "layer_norm_eps": 1e-12,
26
  "model_type": "videomae",
 
32
  "patch_size": 16,
33
  "problem_type": "single_label_classification",
34
  "qkv_bias": true,
35
+ "transformers_version": "4.57.3",
 
36
  "tubelet_size": 2,
37
  "use_mean_pooling": false
38
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:329e5beb2e7c19b63904340546ca3fe317b51f46653b70d0e0d919a805f31346
3
- size 344946604
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:94704761688e0370a2ad99340d995c6285595680d147cfce675b800a1f40cb95
3
+ size 344937368
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6b6158a434eaa32367a7776f09cc1d8fc889d6f84264e016458cc6d6e64cffee
3
- size 690010170
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1b6c4bf485686dc2460f20adf662bf6ef21a6f5f482367135c6e12dae3122b67
3
+ size 689988619
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:14685d0a57d2791329f7ea3e95fe4981ccfcaaee8b8e664e80ce9f66919b53af
3
- size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a33122a368dd627dd121c5da60672d47297ea002d4ed2ce3715b215c4f2e6ba4
3
+ size 14645
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b8342b8994d7c35e7b92b2cd289f9708f3fa204c907223cee09ab2392b162ab7
3
- size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:215f49c99657537ce1b0b1d277562ab616c1c5f0831e119eb6f1022948171c72
3
+ size 1465
test_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 24.008,
3
- "eval_accuracy": 0.8387096774193549,
4
- "eval_loss": 0.7947272062301636,
5
- "eval_runtime": 53.1896,
6
- "eval_samples_per_second": 0.583,
7
- "eval_steps_per_second": 0.075
8
  }
 
1
  {
2
+ "eval_accuracy": 0.9393939393939394,
3
+ "eval_loss": 0.1900058090686798,
4
+ "eval_model_preparation_time": 0.0031,
5
+ "eval_runtime": 58.6079,
6
+ "eval_samples_per_second": 0.563,
7
+ "eval_steps_per_second": 0.085
8
  }
trainer_state.json CHANGED
@@ -1,802 +1,464 @@
1
  {
2
- "best_metric": 0.8387096774193549,
3
- "best_model_checkpoint": "videomae-base-finetuned-dogBehavior/checkpoint-589",
4
- "epoch": 24.008,
 
5
  "eval_steps": 500,
6
- "global_step": 750,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.013333333333333334,
13
- "grad_norm": 6.676718711853027,
14
- "learning_rate": 6.666666666666667e-06,
15
- "loss": 1.6401,
16
  "step": 10
17
  },
18
  {
19
- "epoch": 0.02666666666666667,
20
- "grad_norm": 6.231757640838623,
21
- "learning_rate": 1.3333333333333333e-05,
22
- "loss": 1.6176,
23
  "step": 20
24
  },
25
  {
26
  "epoch": 0.04,
27
- "grad_norm": 11.501175880432129,
28
- "learning_rate": 2e-05,
29
- "loss": 1.5395,
 
 
 
 
 
 
 
 
 
30
  "step": 30
31
  },
32
  {
33
- "epoch": 0.04133333333333333,
34
- "eval_accuracy": 0.3225806451612903,
35
- "eval_loss": 1.5124112367630005,
36
- "eval_runtime": 48.7158,
37
- "eval_samples_per_second": 0.636,
38
- "eval_steps_per_second": 0.082,
39
- "step": 31
40
  },
41
  {
42
- "epoch": 1.012,
43
- "grad_norm": 7.805171012878418,
44
- "learning_rate": 2.6666666666666667e-05,
45
- "loss": 1.5215,
46
- "step": 40
47
  },
48
  {
49
- "epoch": 1.0253333333333334,
50
- "grad_norm": 6.267619609832764,
51
- "learning_rate": 3.3333333333333335e-05,
52
- "loss": 1.5229,
 
 
53
  "step": 50
54
  },
55
  {
56
- "epoch": 1.0386666666666666,
57
- "grad_norm": 9.20750904083252,
58
- "learning_rate": 4e-05,
59
- "loss": 1.4347,
60
  "step": 60
61
  },
62
  {
63
- "epoch": 1.0413333333333332,
64
- "eval_accuracy": 0.3225806451612903,
65
- "eval_loss": 1.4659382104873657,
66
- "eval_runtime": 48.9209,
67
- "eval_samples_per_second": 0.634,
68
- "eval_steps_per_second": 0.082,
69
- "step": 62
70
  },
71
  {
72
- "epoch": 2.010666666666667,
73
- "grad_norm": 7.96211051940918,
74
- "learning_rate": 4.666666666666667e-05,
75
- "loss": 1.4278,
76
- "step": 70
 
 
77
  },
78
  {
79
- "epoch": 2.024,
80
- "grad_norm": 9.138361930847168,
81
- "learning_rate": 4.962962962962963e-05,
82
- "loss": 1.463,
83
  "step": 80
84
  },
85
  {
86
- "epoch": 2.037333333333333,
87
- "grad_norm": 20.494976043701172,
88
- "learning_rate": 4.888888888888889e-05,
89
- "loss": 1.3176,
90
  "step": 90
91
  },
92
  {
93
- "epoch": 2.041333333333333,
94
- "eval_accuracy": 0.4838709677419355,
95
- "eval_loss": 1.426421880722046,
96
- "eval_runtime": 48.6191,
97
- "eval_samples_per_second": 0.638,
98
- "eval_steps_per_second": 0.082,
99
- "step": 93
100
  },
101
  {
102
- "epoch": 3.009333333333333,
103
- "grad_norm": 5.1405029296875,
104
- "learning_rate": 4.814814814814815e-05,
105
- "loss": 1.2299,
 
 
106
  "step": 100
107
  },
108
  {
109
- "epoch": 3.022666666666667,
110
- "grad_norm": 12.849315643310547,
111
- "learning_rate": 4.740740740740741e-05,
112
- "loss": 1.3821,
113
  "step": 110
114
  },
115
  {
116
- "epoch": 3.036,
117
- "grad_norm": 10.184627532958984,
118
- "learning_rate": 4.666666666666667e-05,
119
- "loss": 1.2765,
120
  "step": 120
121
  },
122
  {
123
- "epoch": 3.041333333333333,
124
- "eval_accuracy": 0.7096774193548387,
125
- "eval_loss": 0.9989431500434875,
126
- "eval_runtime": 48.7434,
127
- "eval_samples_per_second": 0.636,
128
- "eval_steps_per_second": 0.082,
129
- "step": 124
130
  },
131
  {
132
- "epoch": 4.008,
133
- "grad_norm": 12.079005241394043,
134
- "learning_rate": 4.592592592592593e-05,
135
- "loss": 0.9981,
136
  "step": 130
137
  },
138
  {
139
- "epoch": 4.021333333333334,
140
- "grad_norm": 6.735375881195068,
141
- "learning_rate": 4.518518518518519e-05,
142
- "loss": 0.9428,
143
  "step": 140
144
  },
145
  {
146
- "epoch": 4.034666666666666,
147
- "grad_norm": 5.291360378265381,
148
- "learning_rate": 4.4444444444444447e-05,
149
- "loss": 1.1538,
150
  "step": 150
151
  },
152
  {
153
- "epoch": 4.041333333333333,
154
- "eval_accuracy": 0.6129032258064516,
155
- "eval_loss": 1.074262261390686,
156
- "eval_runtime": 48.0738,
157
- "eval_samples_per_second": 0.645,
158
- "eval_steps_per_second": 0.083,
159
- "step": 155
160
  },
161
  {
162
- "epoch": 5.006666666666667,
163
- "grad_norm": 19.864482879638672,
164
- "learning_rate": 4.3703703703703705e-05,
165
- "loss": 0.6742,
166
  "step": 160
167
  },
168
  {
169
- "epoch": 5.02,
170
- "grad_norm": 8.207176208496094,
171
- "learning_rate": 4.296296296296296e-05,
172
- "loss": 1.2777,
173
  "step": 170
174
  },
175
  {
176
- "epoch": 5.033333333333333,
177
- "grad_norm": 8.51258373260498,
178
- "learning_rate": 4.222222222222222e-05,
179
- "loss": 1.0894,
180
- "step": 180
 
 
181
  },
182
  {
183
- "epoch": 5.041333333333333,
184
- "eval_accuracy": 0.6451612903225806,
185
- "eval_loss": 1.0308966636657715,
186
- "eval_runtime": 49.5518,
187
- "eval_samples_per_second": 0.626,
188
- "eval_steps_per_second": 0.081,
189
- "step": 186
190
  },
191
  {
192
- "epoch": 6.005333333333334,
193
- "grad_norm": 12.326302528381348,
194
- "learning_rate": 4.148148148148148e-05,
195
- "loss": 1.0028,
196
  "step": 190
197
  },
198
  {
199
- "epoch": 6.018666666666666,
200
- "grad_norm": 11.374725341796875,
201
- "learning_rate": 4.074074074074074e-05,
202
- "loss": 1.0612,
203
  "step": 200
204
  },
205
  {
206
- "epoch": 6.032,
207
- "grad_norm": 10.230300903320312,
208
- "learning_rate": 4e-05,
209
- "loss": 0.705,
210
- "step": 210
 
 
211
  },
212
  {
213
- "epoch": 6.041333333333333,
214
- "eval_accuracy": 0.7096774193548387,
215
- "eval_loss": 0.8872017860412598,
216
- "eval_runtime": 48.4227,
217
- "eval_samples_per_second": 0.64,
218
- "eval_steps_per_second": 0.083,
219
- "step": 217
220
  },
221
  {
222
- "epoch": 7.004,
223
- "grad_norm": 15.614605903625488,
224
- "learning_rate": 3.925925925925926e-05,
225
- "loss": 0.6634,
226
  "step": 220
227
  },
228
  {
229
- "epoch": 7.017333333333333,
230
- "grad_norm": 13.820854187011719,
231
- "learning_rate": 3.851851851851852e-05,
232
- "loss": 0.7886,
 
 
 
 
 
 
 
 
 
233
  "step": 230
234
  },
235
  {
236
- "epoch": 7.030666666666667,
237
- "grad_norm": 5.780624866485596,
238
- "learning_rate": 3.777777777777778e-05,
239
- "loss": 0.8064,
240
  "step": 240
241
  },
242
  {
243
- "epoch": 7.041333333333333,
244
- "eval_accuracy": 0.7096774193548387,
245
- "eval_loss": 1.0080758333206177,
246
- "eval_runtime": 48.4037,
247
- "eval_samples_per_second": 0.64,
248
- "eval_steps_per_second": 0.083,
249
- "step": 248
250
  },
251
  {
252
- "epoch": 8.002666666666666,
253
- "grad_norm": 15.558658599853516,
254
- "learning_rate": 3.7037037037037037e-05,
255
- "loss": 0.6587,
 
 
256
  "step": 250
257
  },
258
  {
259
- "epoch": 8.016,
260
- "grad_norm": 12.6831636428833,
261
- "learning_rate": 3.62962962962963e-05,
262
- "loss": 0.6791,
263
  "step": 260
264
  },
265
  {
266
- "epoch": 8.029333333333334,
267
- "grad_norm": 7.261561870574951,
268
- "learning_rate": 3.555555555555556e-05,
269
- "loss": 0.6057,
270
  "step": 270
271
  },
272
  {
273
- "epoch": 8.041333333333334,
274
- "eval_accuracy": 0.7419354838709677,
275
- "eval_loss": 0.8230811357498169,
276
- "eval_runtime": 49.1888,
277
- "eval_samples_per_second": 0.63,
278
- "eval_steps_per_second": 0.081,
279
- "step": 279
280
- },
281
- {
282
- "epoch": 9.001333333333333,
283
- "grad_norm": 12.9092435836792,
284
- "learning_rate": 3.481481481481482e-05,
285
- "loss": 0.7389,
286
  "step": 280
287
  },
288
  {
289
- "epoch": 9.014666666666667,
290
- "grad_norm": 7.087100028991699,
291
- "learning_rate": 3.4074074074074077e-05,
292
- "loss": 0.4613,
293
  "step": 290
294
  },
295
  {
296
- "epoch": 9.028,
297
- "grad_norm": 18.20285415649414,
298
- "learning_rate": 3.3333333333333335e-05,
299
- "loss": 0.7337,
300
  "step": 300
301
  },
302
  {
303
- "epoch": 9.041333333333334,
304
- "grad_norm": 8.108297348022461,
305
- "learning_rate": 3.25925925925926e-05,
306
- "loss": 0.493,
307
- "step": 310
 
 
308
  },
309
  {
310
- "epoch": 9.041333333333334,
311
- "eval_accuracy": 0.7741935483870968,
312
- "eval_loss": 0.6791331171989441,
313
- "eval_runtime": 48.1,
314
- "eval_samples_per_second": 0.644,
315
- "eval_steps_per_second": 0.083,
316
  "step": 310
317
  },
318
  {
319
- "epoch": 10.013333333333334,
320
- "grad_norm": 6.157079696655273,
321
- "learning_rate": 3.185185185185185e-05,
322
- "loss": 0.4432,
323
  "step": 320
324
  },
325
  {
326
- "epoch": 10.026666666666667,
327
- "grad_norm": 14.456326484680176,
328
- "learning_rate": 3.111111111111111e-05,
329
- "loss": 0.5062,
 
 
 
 
 
 
 
 
 
330
  "step": 330
331
  },
332
  {
333
- "epoch": 10.04,
334
- "grad_norm": 12.937379837036133,
335
- "learning_rate": 3.037037037037037e-05,
336
- "loss": 0.6431,
337
  "step": 340
338
  },
339
  {
340
- "epoch": 10.041333333333334,
341
- "eval_accuracy": 0.6129032258064516,
342
- "eval_loss": 0.7945635914802551,
343
- "eval_runtime": 95.0741,
344
- "eval_samples_per_second": 0.326,
345
- "eval_steps_per_second": 0.042,
346
- "step": 341
347
  },
348
  {
349
- "epoch": 11.012,
350
- "grad_norm": 16.46090316772461,
351
- "learning_rate": 2.962962962962963e-05,
352
- "loss": 0.4597,
 
 
353
  "step": 350
354
  },
355
  {
356
- "epoch": 11.025333333333334,
357
- "grad_norm": 6.265634536743164,
358
- "learning_rate": 2.8888888888888888e-05,
359
- "loss": 0.4432,
360
  "step": 360
361
  },
362
  {
363
- "epoch": 11.038666666666666,
364
- "grad_norm": 5.195762634277344,
365
- "learning_rate": 2.814814814814815e-05,
366
- "loss": 0.397,
367
  "step": 370
368
  },
369
  {
370
- "epoch": 11.041333333333334,
371
- "eval_accuracy": 0.7741935483870968,
372
- "eval_loss": 0.8140308856964111,
373
- "eval_runtime": 47.9223,
374
- "eval_samples_per_second": 0.647,
375
- "eval_steps_per_second": 0.083,
376
- "step": 372
377
  },
378
  {
379
- "epoch": 12.010666666666667,
380
- "grad_norm": 28.242441177368164,
381
- "learning_rate": 2.7407407407407408e-05,
382
- "loss": 0.5518,
383
  "step": 380
384
  },
385
  {
386
- "epoch": 12.024,
387
- "grad_norm": 9.544962882995605,
388
- "learning_rate": 2.6666666666666667e-05,
389
- "loss": 0.2734,
390
  "step": 390
391
  },
392
  {
393
- "epoch": 12.037333333333333,
394
- "grad_norm": 23.855772018432617,
395
- "learning_rate": 2.5925925925925925e-05,
396
- "loss": 0.405,
397
  "step": 400
398
  },
399
  {
400
- "epoch": 12.041333333333334,
401
- "eval_accuracy": 0.7741935483870968,
402
- "eval_loss": 0.8367263674736023,
403
- "eval_runtime": 94.0774,
404
- "eval_samples_per_second": 0.33,
405
- "eval_steps_per_second": 0.043,
406
- "step": 403
407
  },
408
  {
409
- "epoch": 13.009333333333334,
410
- "grad_norm": 9.954264640808105,
411
- "learning_rate": 2.5185185185185183e-05,
412
- "loss": 0.3852,
413
  "step": 410
414
  },
415
- {
416
- "epoch": 13.022666666666666,
417
- "grad_norm": 2.9040560722351074,
418
- "learning_rate": 2.4444444444444445e-05,
419
- "loss": 0.3197,
420
- "step": 420
421
- },
422
- {
423
- "epoch": 13.036,
424
- "grad_norm": 6.378393650054932,
425
- "learning_rate": 2.3703703703703707e-05,
426
- "loss": 0.5463,
427
- "step": 430
428
- },
429
- {
430
- "epoch": 13.041333333333334,
431
- "eval_accuracy": 0.7419354838709677,
432
- "eval_loss": 0.6393604278564453,
433
- "eval_runtime": 96.7879,
434
- "eval_samples_per_second": 0.32,
435
- "eval_steps_per_second": 0.041,
436
- "step": 434
437
- },
438
- {
439
- "epoch": 14.008,
440
- "grad_norm": 6.623013496398926,
441
- "learning_rate": 2.2962962962962965e-05,
442
- "loss": 0.4882,
443
- "step": 440
444
- },
445
- {
446
- "epoch": 14.021333333333333,
447
- "grad_norm": 10.516149520874023,
448
- "learning_rate": 2.2222222222222223e-05,
449
- "loss": 0.3528,
450
- "step": 450
451
- },
452
- {
453
- "epoch": 14.034666666666666,
454
- "grad_norm": 0.7739436626434326,
455
- "learning_rate": 2.148148148148148e-05,
456
- "loss": 0.3068,
457
- "step": 460
458
- },
459
- {
460
- "epoch": 14.041333333333334,
461
- "eval_accuracy": 0.7419354838709677,
462
- "eval_loss": 0.8780257701873779,
463
- "eval_runtime": 94.12,
464
- "eval_samples_per_second": 0.329,
465
- "eval_steps_per_second": 0.042,
466
- "step": 465
467
- },
468
- {
469
- "epoch": 15.006666666666666,
470
- "grad_norm": 13.473437309265137,
471
- "learning_rate": 2.074074074074074e-05,
472
- "loss": 0.4486,
473
- "step": 470
474
- },
475
- {
476
- "epoch": 15.02,
477
- "grad_norm": 11.189105987548828,
478
- "learning_rate": 2e-05,
479
- "loss": 0.2921,
480
- "step": 480
481
- },
482
- {
483
- "epoch": 15.033333333333333,
484
- "grad_norm": 5.033076763153076,
485
- "learning_rate": 1.925925925925926e-05,
486
- "loss": 0.1043,
487
- "step": 490
488
- },
489
- {
490
- "epoch": 15.041333333333334,
491
- "eval_accuracy": 0.7419354838709677,
492
- "eval_loss": 0.7384097576141357,
493
- "eval_runtime": 98.6686,
494
- "eval_samples_per_second": 0.314,
495
- "eval_steps_per_second": 0.041,
496
- "step": 496
497
- },
498
- {
499
- "epoch": 16.005333333333333,
500
- "grad_norm": 5.1849589347839355,
501
- "learning_rate": 1.8518518518518518e-05,
502
- "loss": 0.1256,
503
- "step": 500
504
- },
505
- {
506
- "epoch": 16.018666666666668,
507
- "grad_norm": 15.674521446228027,
508
- "learning_rate": 1.777777777777778e-05,
509
- "loss": 0.2178,
510
- "step": 510
511
- },
512
  {
513
  "epoch": 16.032,
514
- "grad_norm": 2.034404993057251,
515
- "learning_rate": 1.7037037037037038e-05,
516
- "loss": 0.3512,
517
- "step": 520
518
- },
519
- {
520
- "epoch": 16.041333333333334,
521
- "eval_accuracy": 0.7419354838709677,
522
- "eval_loss": 0.9134318232536316,
523
- "eval_runtime": 105.2236,
524
- "eval_samples_per_second": 0.295,
525
- "eval_steps_per_second": 0.038,
526
- "step": 527
527
- },
528
- {
529
- "epoch": 17.004,
530
- "grad_norm": 1.709756851196289,
531
- "learning_rate": 1.62962962962963e-05,
532
- "loss": 0.1933,
533
- "step": 530
534
- },
535
- {
536
- "epoch": 17.017333333333333,
537
- "grad_norm": 4.699275970458984,
538
- "learning_rate": 1.5555555555555555e-05,
539
- "loss": 0.2147,
540
- "step": 540
541
- },
542
- {
543
- "epoch": 17.030666666666665,
544
- "grad_norm": 19.59837532043457,
545
- "learning_rate": 1.4814814814814815e-05,
546
- "loss": 0.1996,
547
- "step": 550
548
- },
549
- {
550
- "epoch": 17.041333333333334,
551
- "eval_accuracy": 0.7741935483870968,
552
- "eval_loss": 0.9459213018417358,
553
- "eval_runtime": 104.9758,
554
- "eval_samples_per_second": 0.295,
555
- "eval_steps_per_second": 0.038,
556
- "step": 558
557
- },
558
- {
559
- "epoch": 18.002666666666666,
560
- "grad_norm": 6.645630836486816,
561
- "learning_rate": 1.4074074074074075e-05,
562
- "loss": 0.5076,
563
- "step": 560
564
- },
565
- {
566
- "epoch": 18.016,
567
- "grad_norm": 14.730420112609863,
568
- "learning_rate": 1.3333333333333333e-05,
569
- "loss": 0.2173,
570
- "step": 570
571
- },
572
- {
573
- "epoch": 18.029333333333334,
574
- "grad_norm": 1.0937589406967163,
575
- "learning_rate": 1.2592592592592592e-05,
576
- "loss": 0.2886,
577
- "step": 580
578
- },
579
- {
580
- "epoch": 18.041333333333334,
581
- "eval_accuracy": 0.8387096774193549,
582
- "eval_loss": 0.7947275042533875,
583
- "eval_runtime": 118.0527,
584
- "eval_samples_per_second": 0.263,
585
- "eval_steps_per_second": 0.034,
586
- "step": 589
587
- },
588
- {
589
- "epoch": 19.001333333333335,
590
- "grad_norm": 0.08054836094379425,
591
- "learning_rate": 1.1851851851851853e-05,
592
- "loss": 0.1476,
593
- "step": 590
594
- },
595
- {
596
- "epoch": 19.014666666666667,
597
- "grad_norm": 10.522760391235352,
598
- "learning_rate": 1.1111111111111112e-05,
599
- "loss": 0.1061,
600
- "step": 600
601
- },
602
- {
603
- "epoch": 19.028,
604
- "grad_norm": 28.20208168029785,
605
- "learning_rate": 1.037037037037037e-05,
606
- "loss": 0.2122,
607
- "step": 610
608
- },
609
- {
610
- "epoch": 19.041333333333334,
611
- "grad_norm": 1.5529413223266602,
612
- "learning_rate": 9.62962962962963e-06,
613
- "loss": 0.257,
614
- "step": 620
615
- },
616
- {
617
- "epoch": 19.041333333333334,
618
- "eval_accuracy": 0.8064516129032258,
619
- "eval_loss": 0.9583818316459656,
620
- "eval_runtime": 103.5542,
621
- "eval_samples_per_second": 0.299,
622
- "eval_steps_per_second": 0.039,
623
- "step": 620
624
- },
625
- {
626
- "epoch": 20.013333333333332,
627
- "grad_norm": 0.6693050265312195,
628
- "learning_rate": 8.88888888888889e-06,
629
- "loss": 0.0574,
630
- "step": 630
631
- },
632
- {
633
- "epoch": 20.026666666666667,
634
- "grad_norm": 0.20252086222171783,
635
- "learning_rate": 8.14814814814815e-06,
636
- "loss": 0.3624,
637
- "step": 640
638
- },
639
- {
640
- "epoch": 20.04,
641
- "grad_norm": 46.063262939453125,
642
- "learning_rate": 7.4074074074074075e-06,
643
- "loss": 0.3948,
644
- "step": 650
645
- },
646
- {
647
- "epoch": 20.041333333333334,
648
- "eval_accuracy": 0.7741935483870968,
649
- "eval_loss": 0.9725757837295532,
650
- "eval_runtime": 99.1534,
651
- "eval_samples_per_second": 0.313,
652
- "eval_steps_per_second": 0.04,
653
- "step": 651
654
- },
655
- {
656
- "epoch": 21.012,
657
- "grad_norm": 16.540903091430664,
658
- "learning_rate": 6.666666666666667e-06,
659
- "loss": 0.14,
660
- "step": 660
661
- },
662
- {
663
- "epoch": 21.025333333333332,
664
- "grad_norm": 0.8179630041122437,
665
- "learning_rate": 5.925925925925927e-06,
666
- "loss": 0.1677,
667
- "step": 670
668
- },
669
- {
670
- "epoch": 21.038666666666668,
671
- "grad_norm": 6.616237163543701,
672
- "learning_rate": 5.185185185185185e-06,
673
- "loss": 0.1969,
674
- "step": 680
675
- },
676
- {
677
- "epoch": 21.041333333333334,
678
- "eval_accuracy": 0.8064516129032258,
679
- "eval_loss": 0.8744015097618103,
680
- "eval_runtime": 100.2476,
681
- "eval_samples_per_second": 0.309,
682
- "eval_steps_per_second": 0.04,
683
- "step": 682
684
- },
685
- {
686
- "epoch": 22.010666666666665,
687
- "grad_norm": 0.2741946578025818,
688
- "learning_rate": 4.444444444444445e-06,
689
- "loss": 0.1749,
690
- "step": 690
691
- },
692
- {
693
- "epoch": 22.024,
694
- "grad_norm": 4.550361633300781,
695
- "learning_rate": 3.7037037037037037e-06,
696
- "loss": 0.1865,
697
- "step": 700
698
- },
699
- {
700
- "epoch": 22.037333333333333,
701
- "grad_norm": 1.1673641204833984,
702
- "learning_rate": 2.9629629629629633e-06,
703
- "loss": 0.1238,
704
- "step": 710
705
- },
706
- {
707
- "epoch": 22.041333333333334,
708
- "eval_accuracy": 0.8064516129032258,
709
- "eval_loss": 0.8289940357208252,
710
- "eval_runtime": 107.0685,
711
- "eval_samples_per_second": 0.29,
712
- "eval_steps_per_second": 0.037,
713
- "step": 713
714
- },
715
- {
716
- "epoch": 23.009333333333334,
717
- "grad_norm": 0.2685479521751404,
718
- "learning_rate": 2.2222222222222225e-06,
719
- "loss": 0.1107,
720
- "step": 720
721
- },
722
- {
723
- "epoch": 23.022666666666666,
724
- "grad_norm": 8.248332023620605,
725
- "learning_rate": 1.4814814814814817e-06,
726
- "loss": 0.1607,
727
- "step": 730
728
- },
729
- {
730
- "epoch": 23.036,
731
- "grad_norm": 0.12218274176120758,
732
- "learning_rate": 7.407407407407408e-07,
733
- "loss": 0.1539,
734
- "step": 740
735
- },
736
- {
737
- "epoch": 23.041333333333334,
738
- "eval_accuracy": 0.8064516129032258,
739
- "eval_loss": 0.8722597360610962,
740
- "eval_runtime": 101.2734,
741
- "eval_samples_per_second": 0.306,
742
- "eval_steps_per_second": 0.039,
743
- "step": 744
744
- },
745
- {
746
- "epoch": 24.008,
747
- "grad_norm": 21.209386825561523,
748
- "learning_rate": 0.0,
749
- "loss": 0.2076,
750
- "step": 750
751
- },
752
- {
753
- "epoch": 24.008,
754
- "eval_accuracy": 0.8064516129032258,
755
- "eval_loss": 0.8719311356544495,
756
- "eval_runtime": 97.5374,
757
- "eval_samples_per_second": 0.318,
758
- "eval_steps_per_second": 0.041,
759
- "step": 750
760
- },
761
- {
762
- "epoch": 24.008,
763
- "step": 750,
764
- "total_flos": 7.41677486274773e+18,
765
- "train_loss": 0.6099979252020518,
766
- "train_runtime": 27777.5595,
767
- "train_samples_per_second": 0.216,
768
- "train_steps_per_second": 0.027
769
- },
770
- {
771
- "epoch": 24.008,
772
- "eval_accuracy": 0.8387096774193549,
773
- "eval_loss": 0.7947275042533875,
774
- "eval_runtime": 43.5309,
775
- "eval_samples_per_second": 0.712,
776
- "eval_steps_per_second": 0.092,
777
- "step": 750
778
- },
779
- {
780
- "epoch": 24.008,
781
- "eval_accuracy": 0.8387096774193549,
782
- "eval_loss": 0.7947273850440979,
783
- "eval_runtime": 43.7384,
784
- "eval_samples_per_second": 0.709,
785
- "eval_steps_per_second": 0.091,
786
- "step": 750
787
  },
788
  {
789
- "epoch": 24.008,
790
- "eval_accuracy": 0.8387096774193549,
791
- "eval_loss": 0.7947272062301636,
792
- "eval_runtime": 53.1896,
793
- "eval_samples_per_second": 0.583,
794
  "eval_steps_per_second": 0.075,
795
- "step": 750
796
  }
797
  ],
798
  "logging_steps": 10,
799
- "max_steps": 750,
800
  "num_input_tokens_seen": 0,
801
  "num_train_epochs": 9223372036854775807,
802
  "save_steps": 500,
@@ -807,12 +469,12 @@
807
  "should_evaluate": false,
808
  "should_log": false,
809
  "should_save": true,
810
- "should_training_stop": true
811
  },
812
  "attributes": {}
813
  }
814
  },
815
- "total_flos": 7.41677486274773e+18,
816
  "train_batch_size": 8,
817
  "trial_name": null,
818
  "trial_params": null
 
1
  {
2
+ "best_global_step": 25,
3
+ "best_metric": 1.0,
4
+ "best_model_checkpoint": "videomae-base-finetuned-dogBehavior/checkpoint-25",
5
+ "epoch": 16.04,
6
  "eval_steps": 500,
7
+ "global_step": 425,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "epoch": 0.016,
14
+ "grad_norm": 0.08260558545589447,
15
+ "learning_rate": 7.142857142857143e-06,
16
+ "loss": 0.0785,
17
  "step": 10
18
  },
19
  {
20
+ "epoch": 0.032,
21
+ "grad_norm": 0.0643758624792099,
22
+ "learning_rate": 1.5079365079365079e-05,
23
+ "loss": 0.1621,
24
  "step": 20
25
  },
26
  {
27
  "epoch": 0.04,
28
+ "eval_accuracy": 1.0,
29
+ "eval_loss": 0.003746605012565851,
30
+ "eval_runtime": 65.8623,
31
+ "eval_samples_per_second": 0.501,
32
+ "eval_steps_per_second": 0.076,
33
+ "step": 25
34
+ },
35
+ {
36
+ "epoch": 1.008,
37
+ "grad_norm": 5.545346736907959,
38
+ "learning_rate": 2.3015873015873015e-05,
39
+ "loss": 0.1879,
40
  "step": 30
41
  },
42
  {
43
+ "epoch": 1.024,
44
+ "grad_norm": 0.2720630466938019,
45
+ "learning_rate": 3.095238095238095e-05,
46
+ "loss": 0.129,
47
+ "step": 40
 
 
48
  },
49
  {
50
+ "epoch": 1.04,
51
+ "grad_norm": 0.02173990197479725,
52
+ "learning_rate": 3.888888888888889e-05,
53
+ "loss": 0.0774,
54
+ "step": 50
55
  },
56
  {
57
+ "epoch": 1.04,
58
+ "eval_accuracy": 1.0,
59
+ "eval_loss": 0.0037209123838692904,
60
+ "eval_runtime": 68.3183,
61
+ "eval_samples_per_second": 0.483,
62
+ "eval_steps_per_second": 0.073,
63
  "step": 50
64
  },
65
  {
66
+ "epoch": 2.016,
67
+ "grad_norm": 0.1353379189968109,
68
+ "learning_rate": 4.682539682539683e-05,
69
+ "loss": 0.2455,
70
  "step": 60
71
  },
72
  {
73
+ "epoch": 2.032,
74
+ "grad_norm": 0.08677935600280762,
75
+ "learning_rate": 4.946619217081851e-05,
76
+ "loss": 0.2208,
77
+ "step": 70
 
 
78
  },
79
  {
80
+ "epoch": 2.04,
81
+ "eval_accuracy": 1.0,
82
+ "eval_loss": 0.0056559075601398945,
83
+ "eval_runtime": 58.9935,
84
+ "eval_samples_per_second": 0.559,
85
+ "eval_steps_per_second": 0.085,
86
+ "step": 75
87
  },
88
  {
89
+ "epoch": 3.008,
90
+ "grad_norm": 5.721180438995361,
91
+ "learning_rate": 4.8576512455516014e-05,
92
+ "loss": 0.1613,
93
  "step": 80
94
  },
95
  {
96
+ "epoch": 3.024,
97
+ "grad_norm": 13.376359939575195,
98
+ "learning_rate": 4.768683274021353e-05,
99
+ "loss": 0.2327,
100
  "step": 90
101
  },
102
  {
103
+ "epoch": 3.04,
104
+ "grad_norm": 0.04108860716223717,
105
+ "learning_rate": 4.6797153024911034e-05,
106
+ "loss": 0.0413,
107
+ "step": 100
 
 
108
  },
109
  {
110
+ "epoch": 3.04,
111
+ "eval_accuracy": 0.9090909090909091,
112
+ "eval_loss": 0.3500887453556061,
113
+ "eval_runtime": 70.4068,
114
+ "eval_samples_per_second": 0.469,
115
+ "eval_steps_per_second": 0.071,
116
  "step": 100
117
  },
118
  {
119
+ "epoch": 4.016,
120
+ "grad_norm": 0.014622594229876995,
121
+ "learning_rate": 4.590747330960855e-05,
122
+ "loss": 0.0045,
123
  "step": 110
124
  },
125
  {
126
+ "epoch": 4.032,
127
+ "grad_norm": 0.011623167432844639,
128
+ "learning_rate": 4.501779359430605e-05,
129
+ "loss": 0.1179,
130
  "step": 120
131
  },
132
  {
133
+ "epoch": 4.04,
134
+ "eval_accuracy": 1.0,
135
+ "eval_loss": 0.011436140164732933,
136
+ "eval_runtime": 70.1018,
137
+ "eval_samples_per_second": 0.471,
138
+ "eval_steps_per_second": 0.071,
139
+ "step": 125
140
  },
141
  {
142
+ "epoch": 5.008,
143
+ "grad_norm": 0.16675230860710144,
144
+ "learning_rate": 4.4128113879003566e-05,
145
+ "loss": 0.0699,
146
  "step": 130
147
  },
148
  {
149
+ "epoch": 5.024,
150
+ "grad_norm": 0.1757970154285431,
151
+ "learning_rate": 4.3238434163701066e-05,
152
+ "loss": 0.069,
153
  "step": 140
154
  },
155
  {
156
+ "epoch": 5.04,
157
+ "grad_norm": 0.02306324616074562,
158
+ "learning_rate": 4.234875444839858e-05,
159
+ "loss": 0.0921,
160
  "step": 150
161
  },
162
  {
163
+ "epoch": 5.04,
164
+ "eval_accuracy": 0.9696969696969697,
165
+ "eval_loss": 0.08965646475553513,
166
+ "eval_runtime": 58.4708,
167
+ "eval_samples_per_second": 0.564,
168
+ "eval_steps_per_second": 0.086,
169
+ "step": 150
170
  },
171
  {
172
+ "epoch": 6.016,
173
+ "grad_norm": 0.05490000173449516,
174
+ "learning_rate": 4.1459074733096085e-05,
175
+ "loss": 0.2398,
176
  "step": 160
177
  },
178
  {
179
+ "epoch": 6.032,
180
+ "grad_norm": 82.93873596191406,
181
+ "learning_rate": 4.05693950177936e-05,
182
+ "loss": 0.0557,
183
  "step": 170
184
  },
185
  {
186
+ "epoch": 6.04,
187
+ "eval_accuracy": 1.0,
188
+ "eval_loss": 0.0020419044885784388,
189
+ "eval_runtime": 66.481,
190
+ "eval_samples_per_second": 0.496,
191
+ "eval_steps_per_second": 0.075,
192
+ "step": 175
193
  },
194
  {
195
+ "epoch": 7.008,
196
+ "grad_norm": 0.34412676095962524,
197
+ "learning_rate": 3.9679715302491105e-05,
198
+ "loss": 0.2047,
199
+ "step": 180
 
 
200
  },
201
  {
202
+ "epoch": 7.024,
203
+ "grad_norm": 2.353379249572754,
204
+ "learning_rate": 3.879003558718861e-05,
205
+ "loss": 0.038,
206
  "step": 190
207
  },
208
  {
209
+ "epoch": 7.04,
210
+ "grad_norm": 0.10385355353355408,
211
+ "learning_rate": 3.7900355871886125e-05,
212
+ "loss": 0.0366,
213
  "step": 200
214
  },
215
  {
216
+ "epoch": 7.04,
217
+ "eval_accuracy": 1.0,
218
+ "eval_loss": 0.0015560887986794114,
219
+ "eval_runtime": 65.9693,
220
+ "eval_samples_per_second": 0.5,
221
+ "eval_steps_per_second": 0.076,
222
+ "step": 200
223
  },
224
  {
225
+ "epoch": 8.016,
226
+ "grad_norm": 0.23118802905082703,
227
+ "learning_rate": 3.701067615658363e-05,
228
+ "loss": 0.2039,
229
+ "step": 210
 
 
230
  },
231
  {
232
+ "epoch": 8.032,
233
+ "grad_norm": 0.016625450924038887,
234
+ "learning_rate": 3.6120996441281144e-05,
235
+ "loss": 0.0667,
236
  "step": 220
237
  },
238
  {
239
+ "epoch": 8.04,
240
+ "eval_accuracy": 0.9696969696969697,
241
+ "eval_loss": 0.06851887702941895,
242
+ "eval_runtime": 59.1794,
243
+ "eval_samples_per_second": 0.558,
244
+ "eval_steps_per_second": 0.084,
245
+ "step": 225
246
+ },
247
+ {
248
+ "epoch": 9.008,
249
+ "grad_norm": 0.011781550943851471,
250
+ "learning_rate": 3.5231316725978644e-05,
251
+ "loss": 0.0016,
252
  "step": 230
253
  },
254
  {
255
+ "epoch": 9.024,
256
+ "grad_norm": 0.008836457505822182,
257
+ "learning_rate": 3.434163701067616e-05,
258
+ "loss": 0.0037,
259
  "step": 240
260
  },
261
  {
262
+ "epoch": 9.04,
263
+ "grad_norm": 0.0074387528002262115,
264
+ "learning_rate": 3.345195729537366e-05,
265
+ "loss": 0.1505,
266
+ "step": 250
 
 
267
  },
268
  {
269
+ "epoch": 9.04,
270
+ "eval_accuracy": 0.9696969696969697,
271
+ "eval_loss": 0.12975242733955383,
272
+ "eval_runtime": 66.9783,
273
+ "eval_samples_per_second": 0.493,
274
+ "eval_steps_per_second": 0.075,
275
  "step": 250
276
  },
277
  {
278
+ "epoch": 10.016,
279
+ "grad_norm": 0.030738357454538345,
280
+ "learning_rate": 3.2562277580071177e-05,
281
+ "loss": 0.081,
282
  "step": 260
283
  },
284
  {
285
+ "epoch": 10.032,
286
+ "grad_norm": 0.011689607053995132,
287
+ "learning_rate": 3.167259786476868e-05,
288
+ "loss": 0.0589,
289
  "step": 270
290
  },
291
  {
292
+ "epoch": 10.04,
293
+ "eval_accuracy": 1.0,
294
+ "eval_loss": 0.005353024695068598,
295
+ "eval_runtime": 64.1994,
296
+ "eval_samples_per_second": 0.514,
297
+ "eval_steps_per_second": 0.078,
298
+ "step": 275
299
+ },
300
+ {
301
+ "epoch": 11.008,
302
+ "grad_norm": 4.4128546714782715,
303
+ "learning_rate": 3.0782918149466196e-05,
304
+ "loss": 0.0977,
305
  "step": 280
306
  },
307
  {
308
+ "epoch": 11.024,
309
+ "grad_norm": 0.013472197577357292,
310
+ "learning_rate": 2.98932384341637e-05,
311
+ "loss": 0.0509,
312
  "step": 290
313
  },
314
  {
315
+ "epoch": 11.04,
316
+ "grad_norm": 0.01684253290295601,
317
+ "learning_rate": 2.9003558718861212e-05,
318
+ "loss": 0.0669,
319
  "step": 300
320
  },
321
  {
322
+ "epoch": 11.04,
323
+ "eval_accuracy": 0.9696969696969697,
324
+ "eval_loss": 0.026497945189476013,
325
+ "eval_runtime": 114.7091,
326
+ "eval_samples_per_second": 0.288,
327
+ "eval_steps_per_second": 0.044,
328
+ "step": 300
329
  },
330
  {
331
+ "epoch": 12.016,
332
+ "grad_norm": 42.653953552246094,
333
+ "learning_rate": 2.811387900355872e-05,
334
+ "loss": 0.0402,
 
 
335
  "step": 310
336
  },
337
  {
338
+ "epoch": 12.032,
339
+ "grad_norm": 0.9748566150665283,
340
+ "learning_rate": 2.7224199288256232e-05,
341
+ "loss": 0.0094,
342
  "step": 320
343
  },
344
  {
345
+ "epoch": 12.04,
346
+ "eval_accuracy": 0.9696969696969697,
347
+ "eval_loss": 0.12569771707057953,
348
+ "eval_runtime": 67.8328,
349
+ "eval_samples_per_second": 0.486,
350
+ "eval_steps_per_second": 0.074,
351
+ "step": 325
352
+ },
353
+ {
354
+ "epoch": 13.008,
355
+ "grad_norm": 0.06350671499967575,
356
+ "learning_rate": 2.6334519572953735e-05,
357
+ "loss": 0.0117,
358
  "step": 330
359
  },
360
  {
361
+ "epoch": 13.024,
362
+ "grad_norm": 0.02096625417470932,
363
+ "learning_rate": 2.5444839857651248e-05,
364
+ "loss": 0.0003,
365
  "step": 340
366
  },
367
  {
368
+ "epoch": 13.04,
369
+ "grad_norm": 13.891782760620117,
370
+ "learning_rate": 2.4555160142348754e-05,
371
+ "loss": 0.1737,
372
+ "step": 350
 
 
373
  },
374
  {
375
+ "epoch": 13.04,
376
+ "eval_accuracy": 1.0,
377
+ "eval_loss": 0.00027246223180554807,
378
+ "eval_runtime": 66.3634,
379
+ "eval_samples_per_second": 0.497,
380
+ "eval_steps_per_second": 0.075,
381
  "step": 350
382
  },
383
  {
384
+ "epoch": 14.016,
385
+ "grad_norm": 0.00640275189653039,
386
+ "learning_rate": 2.3665480427046264e-05,
387
+ "loss": 0.0056,
388
  "step": 360
389
  },
390
  {
391
+ "epoch": 14.032,
392
+ "grad_norm": 0.2554427683353424,
393
+ "learning_rate": 2.277580071174377e-05,
394
+ "loss": 0.0775,
395
  "step": 370
396
  },
397
  {
398
+ "epoch": 14.04,
399
+ "eval_accuracy": 1.0,
400
+ "eval_loss": 0.0005332967266440392,
401
+ "eval_runtime": 57.622,
402
+ "eval_samples_per_second": 0.573,
403
+ "eval_steps_per_second": 0.087,
404
+ "step": 375
405
  },
406
  {
407
+ "epoch": 15.008,
408
+ "grad_norm": 0.010061250999569893,
409
+ "learning_rate": 2.188612099644128e-05,
410
+ "loss": 0.0076,
411
  "step": 380
412
  },
413
  {
414
+ "epoch": 15.024,
415
+ "grad_norm": 0.009161919355392456,
416
+ "learning_rate": 2.099644128113879e-05,
417
+ "loss": 0.0775,
418
  "step": 390
419
  },
420
  {
421
+ "epoch": 15.04,
422
+ "grad_norm": 0.030138863250613213,
423
+ "learning_rate": 2.01067615658363e-05,
424
+ "loss": 0.0805,
425
  "step": 400
426
  },
427
  {
428
+ "epoch": 15.04,
429
+ "eval_accuracy": 1.0,
430
+ "eval_loss": 0.0005667555378749967,
431
+ "eval_runtime": 65.0511,
432
+ "eval_samples_per_second": 0.507,
433
+ "eval_steps_per_second": 0.077,
434
+ "step": 400
435
  },
436
  {
437
+ "epoch": 16.016,
438
+ "grad_norm": 0.02383618988096714,
439
+ "learning_rate": 1.9217081850533806e-05,
440
+ "loss": 0.0705,
441
  "step": 410
442
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
443
  {
444
  "epoch": 16.032,
445
+ "grad_norm": 0.027891239151358604,
446
+ "learning_rate": 1.8327402135231316e-05,
447
+ "loss": 0.0035,
448
+ "step": 420
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
449
  },
450
  {
451
+ "epoch": 16.04,
452
+ "eval_accuracy": 1.0,
453
+ "eval_loss": 0.0008408486610278487,
454
+ "eval_runtime": 66.7117,
455
+ "eval_samples_per_second": 0.495,
456
  "eval_steps_per_second": 0.075,
457
+ "step": 425
458
  }
459
  ],
460
  "logging_steps": 10,
461
+ "max_steps": 625,
462
  "num_input_tokens_seen": 0,
463
  "num_train_epochs": 9223372036854775807,
464
  "save_steps": 500,
 
469
  "should_evaluate": false,
470
  "should_log": false,
471
  "should_save": true,
472
+ "should_training_stop": false
473
  },
474
  "attributes": {}
475
  }
476
  },
477
+ "total_flos": 4.2366196042039296e+18,
478
  "train_batch_size": 8,
479
  "trial_name": null,
480
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f01a4e4c46eb22acc5c0feecd25612c1b333942e9e4dc5017ee55a13f9d34337
3
- size 5240
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aeeec3c2e5d7c9acdb45dfc40a15b0dfa96eadede48209631aad893ee4584cea
3
+ size 5841