SodaXII committed on
Commit
0498682
·
verified ·
1 Parent(s): c343474

Model save

Browse files
README.md CHANGED
@@ -18,8 +18,8 @@ should probably proofread and complete it, then remove this comment. -->
18
 
19
  This model is a fine-tuned version of [WinKawaks/vit-tiny-patch16-224](https://huggingface.co/WinKawaks/vit-tiny-patch16-224) on an unknown dataset.
20
  It achieves the following results on the evaluation set:
21
- - Loss: 0.3269
22
- - Accuracy: 0.9228
23
 
24
  ## Model description
25
 
@@ -45,43 +45,73 @@ The following hyperparameters were used during training:
45
  - optimizer: Use OptimizerNames.ADAMW_TORCH with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
46
  - lr_scheduler_type: cosine_with_restarts
47
  - lr_scheduler_warmup_steps: 256
48
- - num_epochs: 15
49
  - mixed_precision_training: Native AMP
50
 
51
  ### Training results
52
 
53
- | Training Loss | Epoch | Step | Validation Loss | Accuracy |
54
- |:-------------:|:-----:|:----:|:---------------:|:--------:|
55
- | 2.0564 | 0.5 | 64 | 1.4541 | 0.4899 |
56
- | 1.0767 | 1.0 | 128 | 0.6909 | 0.7651 |
57
- | 0.4917 | 1.5 | 192 | 0.4307 | 0.8322 |
58
- | 0.285 | 2.0 | 256 | 0.2932 | 0.9027 |
59
- | 0.0902 | 2.5 | 320 | 0.3134 | 0.8993 |
60
- | 0.0588 | 3.0 | 384 | 0.3076 | 0.9161 |
61
- | 0.0155 | 3.5 | 448 | 0.2627 | 0.9396 |
62
- | 0.0066 | 4.0 | 512 | 0.2992 | 0.9295 |
63
- | 0.0017 | 4.5 | 576 | 0.2936 | 0.9228 |
64
- | 0.0009 | 5.0 | 640 | 0.2961 | 0.9228 |
65
- | 0.0006 | 5.5 | 704 | 0.3005 | 0.9228 |
66
- | 0.0005 | 6.0 | 768 | 0.3004 | 0.9228 |
67
- | 0.0005 | 6.5 | 832 | 0.2867 | 0.9262 |
68
- | 0.0004 | 7.0 | 896 | 0.2977 | 0.9295 |
69
- | 0.0003 | 7.5 | 960 | 0.2944 | 0.9295 |
70
- | 0.0002 | 8.0 | 1024 | 0.3074 | 0.9295 |
71
- | 0.0002 | 8.5 | 1088 | 0.3053 | 0.9329 |
72
- | 0.0002 | 9.0 | 1152 | 0.3098 | 0.9295 |
73
- | 0.0001 | 9.5 | 1216 | 0.3102 | 0.9295 |
74
- | 0.0001 | 10.0 | 1280 | 0.3105 | 0.9262 |
75
- | 0.0001 | 10.5 | 1344 | 0.3105 | 0.9262 |
76
- | 0.0001 | 11.0 | 1408 | 0.3202 | 0.9262 |
77
- | 0.0001 | 11.5 | 1472 | 0.3183 | 0.9295 |
78
- | 0.0001 | 12.0 | 1536 | 0.3131 | 0.9329 |
79
- | 0.0001 | 12.5 | 1600 | 0.3157 | 0.9295 |
80
- | 0.0001 | 13.0 | 1664 | 0.3238 | 0.9228 |
81
- | 0.0001 | 13.5 | 1728 | 0.3220 | 0.9228 |
82
- | 0.0001 | 14.0 | 1792 | 0.3266 | 0.9228 |
83
- | 0.0001 | 14.5 | 1856 | 0.3274 | 0.9228 |
84
- | 0.0001 | 15.0 | 1920 | 0.3269 | 0.9228 |
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
 
86
 
87
  ### Framework versions
@@ -89,4 +119,4 @@ The following hyperparameters were used during training:
89
  - Transformers 4.48.3
90
  - Pytorch 2.5.1+cu124
91
  - Datasets 3.3.2
92
- - Tokenizers 0.21.0
 
18
 
19
  This model is a fine-tuned version of [WinKawaks/vit-tiny-patch16-224](https://huggingface.co/WinKawaks/vit-tiny-patch16-224) on an unknown dataset.
20
  It achieves the following results on the evaluation set:
21
+ - Loss: 0.3674
22
+ - Accuracy: 0.9262
23
 
24
  ## Model description
25
 
 
45
  - optimizer: Use OptimizerNames.ADAMW_TORCH with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
46
  - lr_scheduler_type: cosine_with_restarts
47
  - lr_scheduler_warmup_steps: 256
48
+ - num_epochs: 30
49
  - mixed_precision_training: Native AMP
50
 
51
  ### Training results
52
 
53
+ | Training Loss | Epoch | Step | Accuracy | Validation Loss |
54
+ |:-------------:|:-----:|:----:|:--------:|:---------------:|
55
+ | 2.0564 | 0.5 | 64 | 0.4899 | 1.4541 |
56
+ | 1.0767 | 1.0 | 128 | 0.7651 | 0.6909 |
57
+ | 0.4917 | 1.5 | 192 | 0.8322 | 0.4307 |
58
+ | 0.285 | 2.0 | 256 | 0.9027 | 0.2932 |
59
+ | 0.0902 | 2.5 | 320 | 0.8993 | 0.3134 |
60
+ | 0.0588 | 3.0 | 384 | 0.9161 | 0.3076 |
61
+ | 0.0155 | 3.5 | 448 | 0.9396 | 0.2627 |
62
+ | 0.0066 | 4.0 | 512 | 0.9295 | 0.2992 |
63
+ | 0.0017 | 4.5 | 576 | 0.9228 | 0.2936 |
64
+ | 0.0009 | 5.0 | 640 | 0.9228 | 0.2961 |
65
+ | 0.0006 | 5.5 | 704 | 0.9228 | 0.3005 |
66
+ | 0.0005 | 6.0 | 768 | 0.9228 | 0.3004 |
67
+ | 0.0005 | 6.5 | 832 | 0.9262 | 0.2867 |
68
+ | 0.0004 | 7.0 | 896 | 0.9295 | 0.2977 |
69
+ | 0.0003 | 7.5 | 960 | 0.9295 | 0.2944 |
70
+ | 0.0002 | 8.0 | 1024 | 0.9295 | 0.3074 |
71
+ | 0.0002 | 8.5 | 1088 | 0.9329 | 0.3053 |
72
+ | 0.0002 | 9.0 | 1152 | 0.9295 | 0.3098 |
73
+ | 0.0001 | 9.5 | 1216 | 0.9295 | 0.3102 |
74
+ | 0.0001 | 10.0 | 1280 | 0.9262 | 0.3105 |
75
+ | 0.0001 | 10.5 | 1344 | 0.9262 | 0.3105 |
76
+ | 0.0001 | 11.0 | 1408 | 0.9262 | 0.3202 |
77
+ | 0.0001 | 11.5 | 1472 | 0.9295 | 0.3183 |
78
+ | 0.0001 | 12.0 | 1536 | 0.9329 | 0.3131 |
79
+ | 0.0001 | 12.5 | 1600 | 0.9295 | 0.3157 |
80
+ | 0.0001 | 13.0 | 1664 | 0.9228 | 0.3238 |
81
+ | 0.0001 | 13.5 | 1728 | 0.9228 | 0.3220 |
82
+ | 0.0001 | 14.0 | 1792 | 0.9228 | 0.3266 |
83
+ | 0.0001 | 14.5 | 1856 | 0.9228 | 0.3274 |
84
+ | 0.0001 | 15.0 | 1920 | 0.9228 | 0.3269 |
85
+ | 0.0001 | 15.5 | 1984 | 0.9262 | 0.3267 |
86
+ | 0.0001 | 16.0 | 2048 | 0.9228 | 0.3298 |
87
+ | 0.0001 | 16.5 | 2112 | 0.9228 | 0.3330 |
88
+ | 0.0001 | 17.0 | 2176 | 0.9228 | 0.3337 |
89
+ | 0.0001 | 17.5 | 2240 | 0.9228 | 0.3337 |
90
+ | 0.0001 | 18.0 | 2304 | 0.9228 | 0.3355 |
91
+ | 0.0 | 18.5 | 2368 | 0.9228 | 0.3346 |
92
+ | 0.0 | 19.0 | 2432 | 0.9228 | 0.3360 |
93
+ | 0.0 | 19.5 | 2496 | 0.9228 | 0.3368 |
94
+ | 0.0 | 20.0 | 2560 | 0.9228 | 0.3365 |
95
+ | 0.0 | 20.5 | 2624 | 0.9228 | 0.3364 |
96
+ | 0.0 | 21.0 | 2688 | 0.9228 | 0.3412 |
97
+ | 0.0 | 21.5 | 2752 | 0.9228 | 0.3414 |
98
+ | 0.0 | 22.0 | 2816 | 0.9262 | 0.3435 |
99
+ | 0.0 | 22.5 | 2880 | 0.9228 | 0.3557 |
100
+ | 0.0 | 23.0 | 2944 | 0.9295 | 0.3490 |
101
+ | 0.0 | 23.5 | 3008 | 0.9262 | 0.3564 |
102
+ | 0.0 | 24.0 | 3072 | 0.9295 | 0.3545 |
103
+ | 0.0 | 24.5 | 3136 | 0.9262 | 0.3577 |
104
+ | 0.0 | 25.0 | 3200 | 0.9262 | 0.3597 |
105
+ | 0.0 | 25.5 | 3264 | 0.9262 | 0.3632 |
106
+ | 0.0 | 26.0 | 3328 | 0.9262 | 0.3627 |
107
+ | 0.0 | 26.5 | 3392 | 0.9262 | 0.3650 |
108
+ | 0.0 | 27.0 | 3456 | 0.9262 | 0.3664 |
109
+ | 0.0 | 27.5 | 3520 | 0.9262 | 0.3664 |
110
+ | 0.0 | 28.0 | 3584 | 0.9262 | 0.3666 |
111
+ | 0.0 | 28.5 | 3648 | 0.9262 | 0.3666 |
112
+ | 0.0 | 29.0 | 3712 | 0.9262 | 0.3670 |
113
+ | 0.0 | 29.5 | 3776 | 0.9262 | 0.3673 |
114
+ | 0.0 | 30.0 | 3840 | 0.9262 | 0.3674 |
115
 
116
 
117
  ### Framework versions
 
119
  - Transformers 4.48.3
120
  - Pytorch 2.5.1+cu124
121
  - Datasets 3.3.2
122
+ - Tokenizers 0.21.1
all_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 15.0,
3
+ "total_flos": 6.132781352484864e+17,
4
+ "train_loss": 0.1362585227402936,
5
+ "train_runtime": 4195.4959,
6
+ "train_samples_per_second": 29.289,
7
+ "train_steps_per_second": 0.458
8
+ }
logs/events.out.tfevents.1743190651.4955dc82343d.1607.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6c7af3195b7de6bb3689f57bca3872a46b11102d5d98ff9a890b9cc3db0c36ba
3
+ size 22083
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 15.0,
3
+ "total_flos": 6.132781352484864e+17,
4
+ "train_loss": 0.1362585227402936,
5
+ "train_runtime": 4195.4959,
6
+ "train_samples_per_second": 29.289,
7
+ "train_steps_per_second": 0.458
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,522 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.2626972794532776,
3
+ "best_model_checkpoint": "./drive/Shareddrives/CS198-Drones/[v4] Training Output/vit-tiny-patch16-224_rice-leaf-disease-augmented-v4_fft/checkpoint-448",
4
+ "epoch": 15.0,
5
+ "eval_steps": 64,
6
+ "global_step": 1920,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.5,
13
+ "grad_norm": 9.17007064819336,
14
+ "learning_rate": 7.265625e-06,
15
+ "loss": 2.0564,
16
+ "step": 64
17
+ },
18
+ {
19
+ "epoch": 0.5,
20
+ "eval_accuracy": 0.4899328859060403,
21
+ "eval_loss": 1.4541155099868774,
22
+ "eval_runtime": 8.722,
23
+ "eval_samples_per_second": 34.167,
24
+ "eval_steps_per_second": 0.573,
25
+ "step": 64
26
+ },
27
+ {
28
+ "epoch": 1.0,
29
+ "grad_norm": 13.332218170166016,
30
+ "learning_rate": 1.4765625e-05,
31
+ "loss": 1.0767,
32
+ "step": 128
33
+ },
34
+ {
35
+ "epoch": 1.0,
36
+ "eval_accuracy": 0.7651006711409396,
37
+ "eval_loss": 0.6909474730491638,
38
+ "eval_runtime": 8.6475,
39
+ "eval_samples_per_second": 34.461,
40
+ "eval_steps_per_second": 0.578,
41
+ "step": 128
42
+ },
43
+ {
44
+ "epoch": 1.5,
45
+ "grad_norm": 9.764442443847656,
46
+ "learning_rate": 2.2265625e-05,
47
+ "loss": 0.4917,
48
+ "step": 192
49
+ },
50
+ {
51
+ "epoch": 1.5,
52
+ "eval_accuracy": 0.8322147651006712,
53
+ "eval_loss": 0.4307171106338501,
54
+ "eval_runtime": 8.6026,
55
+ "eval_samples_per_second": 34.641,
56
+ "eval_steps_per_second": 0.581,
57
+ "step": 192
58
+ },
59
+ {
60
+ "epoch": 2.0,
61
+ "grad_norm": 9.665606498718262,
62
+ "learning_rate": 2.9765625e-05,
63
+ "loss": 0.285,
64
+ "step": 256
65
+ },
66
+ {
67
+ "epoch": 2.0,
68
+ "eval_accuracy": 0.9026845637583892,
69
+ "eval_loss": 0.2932307720184326,
70
+ "eval_runtime": 7.7763,
71
+ "eval_samples_per_second": 38.322,
72
+ "eval_steps_per_second": 0.643,
73
+ "step": 256
74
+ },
75
+ {
76
+ "epoch": 2.5,
77
+ "grad_norm": 8.143477439880371,
78
+ "learning_rate": 2.9084596206825315e-05,
79
+ "loss": 0.0902,
80
+ "step": 320
81
+ },
82
+ {
83
+ "epoch": 2.5,
84
+ "eval_accuracy": 0.8993288590604027,
85
+ "eval_loss": 0.31344401836395264,
86
+ "eval_runtime": 7.522,
87
+ "eval_samples_per_second": 39.617,
88
+ "eval_steps_per_second": 0.665,
89
+ "step": 320
90
+ },
91
+ {
92
+ "epoch": 3.0,
93
+ "grad_norm": 2.6106486320495605,
94
+ "learning_rate": 2.633961484257573e-05,
95
+ "loss": 0.0588,
96
+ "step": 384
97
+ },
98
+ {
99
+ "epoch": 3.0,
100
+ "eval_accuracy": 0.9161073825503355,
101
+ "eval_loss": 0.3075862526893616,
102
+ "eval_runtime": 8.3334,
103
+ "eval_samples_per_second": 35.76,
104
+ "eval_steps_per_second": 0.6,
105
+ "step": 384
106
+ },
107
+ {
108
+ "epoch": 3.5,
109
+ "grad_norm": 1.9543571472167969,
110
+ "learning_rate": 2.212085192038453e-05,
111
+ "loss": 0.0155,
112
+ "step": 448
113
+ },
114
+ {
115
+ "epoch": 3.5,
116
+ "eval_accuracy": 0.9395973154362416,
117
+ "eval_loss": 0.2626972794532776,
118
+ "eval_runtime": 8.3127,
119
+ "eval_samples_per_second": 35.849,
120
+ "eval_steps_per_second": 0.601,
121
+ "step": 448
122
+ },
123
+ {
124
+ "epoch": 4.0,
125
+ "grad_norm": 0.38745102286338806,
126
+ "learning_rate": 1.6976609572058592e-05,
127
+ "loss": 0.0066,
128
+ "step": 512
129
+ },
130
+ {
131
+ "epoch": 4.0,
132
+ "eval_accuracy": 0.9295302013422819,
133
+ "eval_loss": 0.299156129360199,
134
+ "eval_runtime": 7.2879,
135
+ "eval_samples_per_second": 40.89,
136
+ "eval_steps_per_second": 0.686,
137
+ "step": 512
138
+ },
139
+ {
140
+ "epoch": 4.5,
141
+ "grad_norm": 0.10160534083843231,
142
+ "learning_rate": 1.1575472190259976e-05,
143
+ "loss": 0.0017,
144
+ "step": 576
145
+ },
146
+ {
147
+ "epoch": 4.5,
148
+ "eval_accuracy": 0.9228187919463087,
149
+ "eval_loss": 0.2935960590839386,
150
+ "eval_runtime": 8.4642,
151
+ "eval_samples_per_second": 35.207,
152
+ "eval_steps_per_second": 0.591,
153
+ "step": 576
154
+ },
155
+ {
156
+ "epoch": 5.0,
157
+ "grad_norm": 0.13512970507144928,
158
+ "learning_rate": 6.619412176671753e-06,
159
+ "loss": 0.0009,
160
+ "step": 640
161
+ },
162
+ {
163
+ "epoch": 5.0,
164
+ "eval_accuracy": 0.9228187919463087,
165
+ "eval_loss": 0.29606831073760986,
166
+ "eval_runtime": 8.1466,
167
+ "eval_samples_per_second": 36.58,
168
+ "eval_steps_per_second": 0.614,
169
+ "step": 640
170
+ },
171
+ {
172
+ "epoch": 5.5,
173
+ "grad_norm": 0.08882313966751099,
174
+ "learning_rate": 2.7525563336129812e-06,
175
+ "loss": 0.0006,
176
+ "step": 704
177
+ },
178
+ {
179
+ "epoch": 5.5,
180
+ "eval_accuracy": 0.9228187919463087,
181
+ "eval_loss": 0.30046388506889343,
182
+ "eval_runtime": 7.478,
183
+ "eval_samples_per_second": 39.85,
184
+ "eval_steps_per_second": 0.669,
185
+ "step": 704
186
+ },
187
+ {
188
+ "epoch": 6.0,
189
+ "grad_norm": 0.08427230268716812,
190
+ "learning_rate": 4.774703044353035e-07,
191
+ "loss": 0.0005,
192
+ "step": 768
193
+ },
194
+ {
195
+ "epoch": 6.0,
196
+ "eval_accuracy": 0.9228187919463087,
197
+ "eval_loss": 0.3003771901130676,
198
+ "eval_runtime": 8.3653,
199
+ "eval_samples_per_second": 35.623,
200
+ "eval_steps_per_second": 0.598,
201
+ "step": 768
202
+ },
203
+ {
204
+ "epoch": 6.5,
205
+ "grad_norm": 0.03683856502175331,
206
+ "learning_rate": 2.9910158634081504e-05,
207
+ "loss": 0.0005,
208
+ "step": 832
209
+ },
210
+ {
211
+ "epoch": 6.5,
212
+ "eval_accuracy": 0.9261744966442953,
213
+ "eval_loss": 0.2867479920387268,
214
+ "eval_runtime": 7.215,
215
+ "eval_samples_per_second": 41.303,
216
+ "eval_steps_per_second": 0.693,
217
+ "step": 832
218
+ },
219
+ {
220
+ "epoch": 7.0,
221
+ "grad_norm": 0.04806307703256607,
222
+ "learning_rate": 2.8359951312200077e-05,
223
+ "loss": 0.0004,
224
+ "step": 896
225
+ },
226
+ {
227
+ "epoch": 7.0,
228
+ "eval_accuracy": 0.9295302013422819,
229
+ "eval_loss": 0.2976870834827423,
230
+ "eval_runtime": 8.3401,
231
+ "eval_samples_per_second": 35.731,
232
+ "eval_steps_per_second": 0.6,
233
+ "step": 896
234
+ },
235
+ {
236
+ "epoch": 7.5,
237
+ "grad_norm": 0.033956822007894516,
238
+ "learning_rate": 2.5073384322705278e-05,
239
+ "loss": 0.0003,
240
+ "step": 960
241
+ },
242
+ {
243
+ "epoch": 7.5,
244
+ "eval_accuracy": 0.9295302013422819,
245
+ "eval_loss": 0.2943709194660187,
246
+ "eval_runtime": 7.6673,
247
+ "eval_samples_per_second": 38.867,
248
+ "eval_steps_per_second": 0.652,
249
+ "step": 960
250
+ },
251
+ {
252
+ "epoch": 8.0,
253
+ "grad_norm": 0.031262967735528946,
254
+ "learning_rate": 2.0477604608884026e-05,
255
+ "loss": 0.0002,
256
+ "step": 1024
257
+ },
258
+ {
259
+ "epoch": 8.0,
260
+ "eval_accuracy": 0.9295302013422819,
261
+ "eval_loss": 0.30740392208099365,
262
+ "eval_runtime": 8.2818,
263
+ "eval_samples_per_second": 35.982,
264
+ "eval_steps_per_second": 0.604,
265
+ "step": 1024
266
+ },
267
+ {
268
+ "epoch": 8.5,
269
+ "grad_norm": 0.01630540005862713,
270
+ "learning_rate": 1.516991423792483e-05,
271
+ "loss": 0.0002,
272
+ "step": 1088
273
+ },
274
+ {
275
+ "epoch": 8.5,
276
+ "eval_accuracy": 0.9328859060402684,
277
+ "eval_loss": 0.3053071200847626,
278
+ "eval_runtime": 8.2512,
279
+ "eval_samples_per_second": 36.116,
280
+ "eval_steps_per_second": 0.606,
281
+ "step": 1088
282
+ },
283
+ {
284
+ "epoch": 9.0,
285
+ "grad_norm": 0.016130153089761734,
286
+ "learning_rate": 9.840140535762432e-06,
287
+ "loss": 0.0002,
288
+ "step": 1152
289
+ },
290
+ {
291
+ "epoch": 9.0,
292
+ "eval_accuracy": 0.9295302013422819,
293
+ "eval_loss": 0.3097546696662903,
294
+ "eval_runtime": 8.418,
295
+ "eval_samples_per_second": 35.4,
296
+ "eval_steps_per_second": 0.594,
297
+ "step": 1152
298
+ },
299
+ {
300
+ "epoch": 9.5,
301
+ "grad_norm": 0.014978409744799137,
302
+ "learning_rate": 5.180980944002794e-06,
303
+ "loss": 0.0001,
304
+ "step": 1216
305
+ },
306
+ {
307
+ "epoch": 9.5,
308
+ "eval_accuracy": 0.9295302013422819,
309
+ "eval_loss": 0.310248464345932,
310
+ "eval_runtime": 7.2498,
311
+ "eval_samples_per_second": 41.105,
312
+ "eval_steps_per_second": 0.69,
313
+ "step": 1216
314
+ },
315
+ {
316
+ "epoch": 10.0,
317
+ "grad_norm": 0.013572459109127522,
318
+ "learning_rate": 1.7979748550475833e-06,
319
+ "loss": 0.0001,
320
+ "step": 1280
321
+ },
322
+ {
323
+ "epoch": 10.0,
324
+ "eval_accuracy": 0.9261744966442953,
325
+ "eval_loss": 0.3105408847332001,
326
+ "eval_runtime": 8.3297,
327
+ "eval_samples_per_second": 35.776,
328
+ "eval_steps_per_second": 0.6,
329
+ "step": 1280
330
+ },
331
+ {
332
+ "epoch": 10.5,
333
+ "grad_norm": 0.010590254329144955,
334
+ "learning_rate": 1.3080316225364152e-07,
335
+ "loss": 0.0001,
336
+ "step": 1344
337
+ },
338
+ {
339
+ "epoch": 10.5,
340
+ "eval_accuracy": 0.9261744966442953,
341
+ "eval_loss": 0.3105214238166809,
342
+ "eval_runtime": 7.2656,
343
+ "eval_samples_per_second": 41.015,
344
+ "eval_steps_per_second": 0.688,
345
+ "step": 1344
346
+ },
347
+ {
348
+ "epoch": 11.0,
349
+ "grad_norm": 0.011299003846943378,
350
+ "learning_rate": 2.9603855973006482e-05,
351
+ "loss": 0.0001,
352
+ "step": 1408
353
+ },
354
+ {
355
+ "epoch": 11.0,
356
+ "eval_accuracy": 0.9261744966442953,
357
+ "eval_loss": 0.32019779086112976,
358
+ "eval_runtime": 8.1649,
359
+ "eval_samples_per_second": 36.498,
360
+ "eval_steps_per_second": 0.612,
361
+ "step": 1408
362
+ },
363
+ {
364
+ "epoch": 11.5,
365
+ "grad_norm": 0.016378453001379967,
366
+ "learning_rate": 2.7440488243452587e-05,
367
+ "loss": 0.0001,
368
+ "step": 1472
369
+ },
370
+ {
371
+ "epoch": 11.5,
372
+ "eval_accuracy": 0.9295302013422819,
373
+ "eval_loss": 0.3183320462703705,
374
+ "eval_runtime": 8.0678,
375
+ "eval_samples_per_second": 36.937,
376
+ "eval_steps_per_second": 0.62,
377
+ "step": 1472
378
+ },
379
+ {
380
+ "epoch": 12.0,
381
+ "grad_norm": 0.007818573154509068,
382
+ "learning_rate": 2.3660261176123762e-05,
383
+ "loss": 0.0001,
384
+ "step": 1536
385
+ },
386
+ {
387
+ "epoch": 12.0,
388
+ "eval_accuracy": 0.9328859060402684,
389
+ "eval_loss": 0.3130977153778076,
390
+ "eval_runtime": 8.2508,
391
+ "eval_samples_per_second": 36.118,
392
+ "eval_steps_per_second": 0.606,
393
+ "step": 1536
394
+ },
395
+ {
396
+ "epoch": 12.5,
397
+ "grad_norm": 0.00784409698098898,
398
+ "learning_rate": 1.875448148769462e-05,
399
+ "loss": 0.0001,
400
+ "step": 1600
401
+ },
402
+ {
403
+ "epoch": 12.5,
404
+ "eval_accuracy": 0.9295302013422819,
405
+ "eval_loss": 0.3157329261302948,
406
+ "eval_runtime": 7.1968,
407
+ "eval_samples_per_second": 41.407,
408
+ "eval_steps_per_second": 0.695,
409
+ "step": 1600
410
+ },
411
+ {
412
+ "epoch": 13.0,
413
+ "grad_norm": 0.006332057528197765,
414
+ "learning_rate": 1.3360741171588585e-05,
415
+ "loss": 0.0001,
416
+ "step": 1664
417
+ },
418
+ {
419
+ "epoch": 13.0,
420
+ "eval_accuracy": 0.9228187919463087,
421
+ "eval_loss": 0.3237887918949127,
422
+ "eval_runtime": 8.2082,
423
+ "eval_samples_per_second": 36.305,
424
+ "eval_steps_per_second": 0.609,
425
+ "step": 1664
426
+ },
427
+ {
428
+ "epoch": 13.5,
429
+ "grad_norm": 0.006929404567927122,
430
+ "learning_rate": 8.180051251245103e-06,
431
+ "loss": 0.0001,
432
+ "step": 1728
433
+ },
434
+ {
435
+ "epoch": 13.5,
436
+ "eval_accuracy": 0.9228187919463087,
437
+ "eval_loss": 0.3219589591026306,
438
+ "eval_runtime": 7.2368,
439
+ "eval_samples_per_second": 41.179,
440
+ "eval_steps_per_second": 0.691,
441
+ "step": 1728
442
+ },
443
+ {
444
+ "epoch": 14.0,
445
+ "grad_norm": 0.005624026525765657,
446
+ "learning_rate": 3.885733119675617e-06,
447
+ "loss": 0.0001,
448
+ "step": 1792
449
+ },
450
+ {
451
+ "epoch": 14.0,
452
+ "eval_accuracy": 0.9228187919463087,
453
+ "eval_loss": 0.3266230821609497,
454
+ "eval_runtime": 7.1645,
455
+ "eval_samples_per_second": 41.594,
456
+ "eval_steps_per_second": 0.698,
457
+ "step": 1792
458
+ },
459
+ {
460
+ "epoch": 14.5,
461
+ "grad_norm": 0.006068665534257889,
462
+ "learning_rate": 1.0359086314671929e-06,
463
+ "loss": 0.0001,
464
+ "step": 1856
465
+ },
466
+ {
467
+ "epoch": 14.5,
468
+ "eval_accuracy": 0.9228187919463087,
469
+ "eval_loss": 0.32735249400138855,
470
+ "eval_runtime": 7.4237,
471
+ "eval_samples_per_second": 40.142,
472
+ "eval_steps_per_second": 0.674,
473
+ "step": 1856
474
+ },
475
+ {
476
+ "epoch": 15.0,
477
+ "grad_norm": 0.007146658841520548,
478
+ "learning_rate": 9.62392481628771e-10,
479
+ "loss": 0.0001,
480
+ "step": 1920
481
+ },
482
+ {
483
+ "epoch": 15.0,
484
+ "eval_accuracy": 0.9228187919463087,
485
+ "eval_loss": 0.32691627740859985,
486
+ "eval_runtime": 7.1631,
487
+ "eval_samples_per_second": 41.602,
488
+ "eval_steps_per_second": 0.698,
489
+ "step": 1920
490
+ },
491
+ {
492
+ "epoch": 15.0,
493
+ "step": 1920,
494
+ "total_flos": 6.132781352484864e+17,
495
+ "train_loss": 0.1362585227402936,
496
+ "train_runtime": 4195.4959,
497
+ "train_samples_per_second": 29.289,
498
+ "train_steps_per_second": 0.458
499
+ }
500
+ ],
501
+ "logging_steps": 64,
502
+ "max_steps": 1920,
503
+ "num_input_tokens_seen": 0,
504
+ "num_train_epochs": 15,
505
+ "save_steps": 64,
506
+ "stateful_callbacks": {
507
+ "TrainerControl": {
508
+ "args": {
509
+ "should_epoch_stop": false,
510
+ "should_evaluate": false,
511
+ "should_log": false,
512
+ "should_save": true,
513
+ "should_training_stop": true
514
+ },
515
+ "attributes": {}
516
+ }
517
+ },
518
+ "total_flos": 6.132781352484864e+17,
519
+ "train_batch_size": 64,
520
+ "trial_name": null,
521
+ "trial_params": null
522
+ }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4c48cc43f351ec35233c274136f90dffa9129d37aa5ea7648a6e73cce86a2af3
3
  size 5496
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6637a4431fabd444509438423930c601c6a4ae8966189a2b6318b9bbda03930c
3
  size 5496
training_metrics.xlsx CHANGED
Binary files a/training_metrics.xlsx and b/training_metrics.xlsx differ