Augusto777 committed on
Commit
9dd0a27
·
verified ·
1 Parent(s): 8e8d6b0

End of training

Browse files
README.md ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ base_model: microsoft/swinv2-tiny-patch4-window8-256
4
+ tags:
5
+ - generated_from_trainer
6
+ datasets:
7
+ - imagefolder
8
+ metrics:
9
+ - accuracy
10
+ model-index:
11
+ - name: swinv2-tiny-patch4-window8-256-Diabetic-Retinopathy-DA
12
+ results:
13
+ - task:
14
+ name: Image Classification
15
+ type: image-classification
16
+ dataset:
17
+ name: imagefolder
18
+ type: imagefolder
19
+ config: default
20
+ split: validation
21
+ args: default
22
+ metrics:
23
+ - name: Accuracy
24
+ type: accuracy
25
+ value: 0.8090909090909091
26
+ ---
27
+
28
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
29
+ should probably proofread and complete it, then remove this comment. -->
30
+
31
+ # swinv2-tiny-patch4-window8-256-Diabetic-Retinopathy-DA
32
+
33
+ This model is a fine-tuned version of [microsoft/swinv2-tiny-patch4-window8-256](https://huggingface.co/microsoft/swinv2-tiny-patch4-window8-256) on the imagefolder dataset.
34
+ It achieves the following results on the evaluation set (these are the metrics of the best checkpoint, epoch 22 / step 506, not of the final epoch):
35
+ - Loss: 0.6974
36
+ - Accuracy: 0.8091
37
+
38
+ ## Model description
39
+
40
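+ Swin Transformer V2 (tiny variant, patch size 4, window size 8, 256×256 input) fine-tuned for five-class image classification. The class labels defined in `config.json` are `Mild`, `Moderate`, `No_DR`, `Proliferate_DR`, and `Severe`. More information needed.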
41
+
42
+ ## Intended uses & limitations
43
+
44
+ More information needed
45
+
46
+ ## Training and evaluation data
47
+
48
+ More information needed
49
+
50
+ ## Training procedure
51
+
52
+ ### Training hyperparameters
53
+
54
+ The following hyperparameters were used during training:
55
+ - learning_rate: 5e-05
56
+ - train_batch_size: 32
57
+ - eval_batch_size: 32
58
+ - seed: 42
59
+ - gradient_accumulation_steps: 4
60
+ - total_train_batch_size: 128
61
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
62
+ - lr_scheduler_type: linear
63
+ - lr_scheduler_warmup_ratio: 0.1
64
+ - num_epochs: 40
65
+
66
+ ### Training results
67
+
68
+ | Training Loss | Epoch | Step | Validation Loss | Accuracy |
69
+ |:-------------:|:-----:|:----:|:---------------:|:--------:|
70
+ | 1.5987 | 1.0 | 23 | 1.5683 | 0.4909 |
71
+ | 1.4137 | 2.0 | 46 | 1.2639 | 0.4909 |
72
+ | 1.1988 | 3.0 | 69 | 0.8726 | 0.7636 |
73
+ | 0.8533 | 4.0 | 92 | 0.6361 | 0.7545 |
74
+ | 0.8042 | 5.0 | 115 | 0.5985 | 0.7545 |
75
+ | 0.7349 | 6.0 | 138 | 0.5943 | 0.7545 |
76
+ | 0.7003 | 7.0 | 161 | 0.5178 | 0.7636 |
77
+ | 0.6641 | 8.0 | 184 | 0.5058 | 0.7545 |
78
+ | 0.641 | 9.0 | 207 | 0.5092 | 0.7909 |
79
+ | 0.6571 | 10.0 | 230 | 0.5319 | 0.7636 |
80
+ | 0.6522 | 11.0 | 253 | 0.5726 | 0.7909 |
81
+ | 0.5659 | 12.0 | 276 | 0.5490 | 0.7727 |
82
+ | 0.5511 | 13.0 | 299 | 0.5465 | 0.8 |
83
+ | 0.5435 | 14.0 | 322 | 0.5728 | 0.7909 |
84
+ | 0.5259 | 15.0 | 345 | 0.6047 | 0.7636 |
85
+ | 0.5496 | 16.0 | 368 | 0.6479 | 0.7818 |
86
+ | 0.543 | 17.0 | 391 | 0.6040 | 0.7727 |
87
+ | 0.4646 | 18.0 | 414 | 0.6269 | 0.7818 |
88
+ | 0.4867 | 19.0 | 437 | 0.6535 | 0.7909 |
89
+ | 0.4357 | 20.0 | 460 | 0.6991 | 0.7727 |
90
+ | 0.4392 | 21.0 | 483 | 0.7127 | 0.7636 |
91
+ | 0.4403 | 22.0 | 506 | 0.6974 | 0.8091 |
92
+ | 0.4358 | 23.0 | 529 | 0.6883 | 0.7818 |
93
+ | 0.4094 | 24.0 | 552 | 0.6768 | 0.8 |
94
+ | 0.3913 | 25.0 | 575 | 0.7270 | 0.7636 |
95
+ | 0.3686 | 26.0 | 598 | 0.7104 | 0.7727 |
96
+ | 0.3679 | 27.0 | 621 | 0.7115 | 0.7818 |
97
+ | 0.378 | 28.0 | 644 | 0.8020 | 0.8091 |
98
+ | 0.3583 | 29.0 | 667 | 0.7524 | 0.7909 |
99
+ | 0.3299 | 30.0 | 690 | 0.7783 | 0.7909 |
100
+ | 0.3672 | 31.0 | 713 | 0.8193 | 0.7909 |
101
+ | 0.3567 | 32.0 | 736 | 0.8095 | 0.7909 |
102
+ | 0.3585 | 33.0 | 759 | 0.8324 | 0.7909 |
103
+ | 0.3191 | 34.0 | 782 | 0.8042 | 0.7909 |
104
+ | 0.3144 | 35.0 | 805 | 0.8189 | 0.7909 |
105
+ | 0.3452 | 36.0 | 828 | 0.8377 | 0.7909 |
106
+ | 0.3263 | 37.0 | 851 | 0.8204 | 0.7909 |
107
+ | 0.2939 | 38.0 | 874 | 0.8103 | 0.7909 |
108
+ | 0.3152 | 39.0 | 897 | 0.8184 | 0.7818 |
109
+ | 0.2787 | 40.0 | 920 | 0.8241 | 0.7818 |
110
+
111
+
112
+ ### Framework versions
113
+
114
+ - Transformers 4.36.2
115
+ - Pytorch 2.1.2+cu118
116
+ - Datasets 2.16.1
117
+ - Tokenizers 0.15.0
all_results.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 40.0,
3
+ "eval_accuracy": 0.8090909090909091,
4
+ "eval_loss": 0.6974316239356995,
5
+ "eval_runtime": 0.6742,
6
+ "eval_samples_per_second": 163.163,
7
+ "eval_steps_per_second": 5.933,
8
+ "train_loss": 0.5426292188789533,
9
+ "train_runtime": 1410.8495,
10
+ "train_samples_per_second": 83.326,
11
+ "train_steps_per_second": 0.652
12
+ }
config.json ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "microsoft/swinv2-tiny-patch4-window8-256",
3
+ "architectures": [
4
+ "Swinv2ForImageClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.0,
7
+ "depths": [
8
+ 2,
9
+ 2,
10
+ 6,
11
+ 2
12
+ ],
13
+ "drop_path_rate": 0.1,
14
+ "embed_dim": 96,
15
+ "encoder_stride": 32,
16
+ "hidden_act": "gelu",
17
+ "hidden_dropout_prob": 0.0,
18
+ "hidden_size": 768,
19
+ "id2label": {
20
+ "0": "Mild",
21
+ "1": "Moderate",
22
+ "2": "No_DR",
23
+ "3": "Proliferate_DR",
24
+ "4": "Severe"
25
+ },
26
+ "image_size": 256,
27
+ "initializer_range": 0.02,
28
+ "label2id": {
29
+ "Mild": 0,
30
+ "Moderate": 1,
31
+ "No_DR": 2,
32
+ "Proliferate_DR": 3,
33
+ "Severe": 4
34
+ },
35
+ "layer_norm_eps": 1e-05,
36
+ "mlp_ratio": 4.0,
37
+ "model_type": "swinv2",
38
+ "num_channels": 3,
39
+ "num_heads": [
40
+ 3,
41
+ 6,
42
+ 12,
43
+ 24
44
+ ],
45
+ "num_layers": 4,
46
+ "patch_size": 4,
47
+ "path_norm": true,
48
+ "pretrained_window_sizes": [
49
+ 0,
50
+ 0,
51
+ 0,
52
+ 0
53
+ ],
54
+ "problem_type": "single_label_classification",
55
+ "qkv_bias": true,
56
+ "torch_dtype": "float32",
57
+ "transformers_version": "4.36.2",
58
+ "use_absolute_embeddings": false,
59
+ "window_size": 8
60
+ }
eval_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 40.0,
3
+ "eval_accuracy": 0.8090909090909091,
4
+ "eval_loss": 0.6974316239356995,
5
+ "eval_runtime": 0.6742,
6
+ "eval_samples_per_second": 163.163,
7
+ "eval_steps_per_second": 5.933
8
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:895a48e889531812cac0cf571535863249257a3f00e2c2c94e6cc38547193626
3
+ size 110359372
preprocessor_config.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_normalize": true,
3
+ "do_rescale": true,
4
+ "do_resize": true,
5
+ "image_mean": [
6
+ 0.485,
7
+ 0.456,
8
+ 0.406
9
+ ],
10
+ "image_processor_type": "ViTImageProcessor",
11
+ "image_std": [
12
+ 0.229,
13
+ 0.224,
14
+ 0.225
15
+ ],
16
+ "resample": 3,
17
+ "rescale_factor": 0.00392156862745098,
18
+ "size": {
19
+ "height": 256,
20
+ "width": 256
21
+ }
22
+ }
runs/Oct12_18-07-36_DESKTOP-SKBE9FB/events.out.tfevents.1728778058.DESKTOP-SKBE9FB.19200.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5e4d23023ce0cca38176c71d71a57e3e9887ebca434db45ba4c04a8a42ebf9ff
3
+ size 32545
runs/Oct12_18-07-36_DESKTOP-SKBE9FB/events.out.tfevents.1728779469.DESKTOP-SKBE9FB.19200.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7cf092e0d8e079f2830f1a9867173a87f69dd934f1e3ba76ee83c8adf6c04c17
3
+ size 411
train_results.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 40.0,
3
+ "train_loss": 0.5426292188789533,
4
+ "train_runtime": 1410.8495,
5
+ "train_samples_per_second": 83.326,
6
+ "train_steps_per_second": 0.652
7
+ }
trainer_state.json ADDED
@@ -0,0 +1,942 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.8090909090909091,
3
+ "best_model_checkpoint": "swinv2-tiny-patch4-window8-256-Diabetic-Retinopathy-DA\\checkpoint-506",
4
+ "epoch": 40.0,
5
+ "eval_steps": 500,
6
+ "global_step": 920,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.43,
13
+ "learning_rate": 5.4347826086956525e-06,
14
+ "loss": 1.6086,
15
+ "step": 10
16
+ },
17
+ {
18
+ "epoch": 0.87,
19
+ "learning_rate": 1.0869565217391305e-05,
20
+ "loss": 1.5987,
21
+ "step": 20
22
+ },
23
+ {
24
+ "epoch": 1.0,
25
+ "eval_accuracy": 0.4909090909090909,
26
+ "eval_loss": 1.5683298110961914,
27
+ "eval_runtime": 1.0793,
28
+ "eval_samples_per_second": 101.92,
29
+ "eval_steps_per_second": 3.706,
30
+ "step": 23
31
+ },
32
+ {
33
+ "epoch": 1.3,
34
+ "learning_rate": 1.630434782608696e-05,
35
+ "loss": 1.5503,
36
+ "step": 30
37
+ },
38
+ {
39
+ "epoch": 1.74,
40
+ "learning_rate": 2.173913043478261e-05,
41
+ "loss": 1.4137,
42
+ "step": 40
43
+ },
44
+ {
45
+ "epoch": 2.0,
46
+ "eval_accuracy": 0.4909090909090909,
47
+ "eval_loss": 1.263899564743042,
48
+ "eval_runtime": 0.6653,
49
+ "eval_samples_per_second": 165.342,
50
+ "eval_steps_per_second": 6.012,
51
+ "step": 46
52
+ },
53
+ {
54
+ "epoch": 2.17,
55
+ "learning_rate": 2.7173913043478262e-05,
56
+ "loss": 1.3316,
57
+ "step": 50
58
+ },
59
+ {
60
+ "epoch": 2.61,
61
+ "learning_rate": 3.260869565217392e-05,
62
+ "loss": 1.1988,
63
+ "step": 60
64
+ },
65
+ {
66
+ "epoch": 3.0,
67
+ "eval_accuracy": 0.7636363636363637,
68
+ "eval_loss": 0.8725916743278503,
69
+ "eval_runtime": 0.6051,
70
+ "eval_samples_per_second": 181.777,
71
+ "eval_steps_per_second": 6.61,
72
+ "step": 69
73
+ },
74
+ {
75
+ "epoch": 3.04,
76
+ "learning_rate": 3.804347826086957e-05,
77
+ "loss": 1.058,
78
+ "step": 70
79
+ },
80
+ {
81
+ "epoch": 3.48,
82
+ "learning_rate": 4.347826086956522e-05,
83
+ "loss": 0.9368,
84
+ "step": 80
85
+ },
86
+ {
87
+ "epoch": 3.91,
88
+ "learning_rate": 4.891304347826087e-05,
89
+ "loss": 0.8533,
90
+ "step": 90
91
+ },
92
+ {
93
+ "epoch": 4.0,
94
+ "eval_accuracy": 0.7545454545454545,
95
+ "eval_loss": 0.6361170411109924,
96
+ "eval_runtime": 0.6021,
97
+ "eval_samples_per_second": 182.679,
98
+ "eval_steps_per_second": 6.643,
99
+ "step": 92
100
+ },
101
+ {
102
+ "epoch": 4.35,
103
+ "learning_rate": 4.9516908212560386e-05,
104
+ "loss": 0.852,
105
+ "step": 100
106
+ },
107
+ {
108
+ "epoch": 4.78,
109
+ "learning_rate": 4.891304347826087e-05,
110
+ "loss": 0.8042,
111
+ "step": 110
112
+ },
113
+ {
114
+ "epoch": 5.0,
115
+ "eval_accuracy": 0.7545454545454545,
116
+ "eval_loss": 0.5984649658203125,
117
+ "eval_runtime": 0.6091,
118
+ "eval_samples_per_second": 180.58,
119
+ "eval_steps_per_second": 6.567,
120
+ "step": 115
121
+ },
122
+ {
123
+ "epoch": 5.22,
124
+ "learning_rate": 4.830917874396135e-05,
125
+ "loss": 0.7697,
126
+ "step": 120
127
+ },
128
+ {
129
+ "epoch": 5.65,
130
+ "learning_rate": 4.770531400966184e-05,
131
+ "loss": 0.7349,
132
+ "step": 130
133
+ },
134
+ {
135
+ "epoch": 6.0,
136
+ "eval_accuracy": 0.7545454545454545,
137
+ "eval_loss": 0.5943260788917542,
138
+ "eval_runtime": 0.6065,
139
+ "eval_samples_per_second": 181.376,
140
+ "eval_steps_per_second": 6.595,
141
+ "step": 138
142
+ },
143
+ {
144
+ "epoch": 6.09,
145
+ "learning_rate": 4.710144927536232e-05,
146
+ "loss": 0.7623,
147
+ "step": 140
148
+ },
149
+ {
150
+ "epoch": 6.52,
151
+ "learning_rate": 4.64975845410628e-05,
152
+ "loss": 0.7249,
153
+ "step": 150
154
+ },
155
+ {
156
+ "epoch": 6.96,
157
+ "learning_rate": 4.589371980676328e-05,
158
+ "loss": 0.7003,
159
+ "step": 160
160
+ },
161
+ {
162
+ "epoch": 7.0,
163
+ "eval_accuracy": 0.7636363636363637,
164
+ "eval_loss": 0.5177913904190063,
165
+ "eval_runtime": 0.6592,
166
+ "eval_samples_per_second": 166.881,
167
+ "eval_steps_per_second": 6.068,
168
+ "step": 161
169
+ },
170
+ {
171
+ "epoch": 7.39,
172
+ "learning_rate": 4.528985507246377e-05,
173
+ "loss": 0.6755,
174
+ "step": 170
175
+ },
176
+ {
177
+ "epoch": 7.83,
178
+ "learning_rate": 4.4685990338164255e-05,
179
+ "loss": 0.6641,
180
+ "step": 180
181
+ },
182
+ {
183
+ "epoch": 8.0,
184
+ "eval_accuracy": 0.7545454545454545,
185
+ "eval_loss": 0.5058346390724182,
186
+ "eval_runtime": 0.6561,
187
+ "eval_samples_per_second": 167.65,
188
+ "eval_steps_per_second": 6.096,
189
+ "step": 184
190
+ },
191
+ {
192
+ "epoch": 8.26,
193
+ "learning_rate": 4.408212560386474e-05,
194
+ "loss": 0.6263,
195
+ "step": 190
196
+ },
197
+ {
198
+ "epoch": 8.7,
199
+ "learning_rate": 4.347826086956522e-05,
200
+ "loss": 0.641,
201
+ "step": 200
202
+ },
203
+ {
204
+ "epoch": 9.0,
205
+ "eval_accuracy": 0.7909090909090909,
206
+ "eval_loss": 0.5091794729232788,
207
+ "eval_runtime": 0.6128,
208
+ "eval_samples_per_second": 179.502,
209
+ "eval_steps_per_second": 6.527,
210
+ "step": 207
211
+ },
212
+ {
213
+ "epoch": 9.13,
214
+ "learning_rate": 4.2874396135265707e-05,
215
+ "loss": 0.6213,
216
+ "step": 210
217
+ },
218
+ {
219
+ "epoch": 9.57,
220
+ "learning_rate": 4.2270531400966186e-05,
221
+ "loss": 0.599,
222
+ "step": 220
223
+ },
224
+ {
225
+ "epoch": 10.0,
226
+ "learning_rate": 4.166666666666667e-05,
227
+ "loss": 0.6571,
228
+ "step": 230
229
+ },
230
+ {
231
+ "epoch": 10.0,
232
+ "eval_accuracy": 0.7636363636363637,
233
+ "eval_loss": 0.5319333076477051,
234
+ "eval_runtime": 0.6289,
235
+ "eval_samples_per_second": 174.897,
236
+ "eval_steps_per_second": 6.36,
237
+ "step": 230
238
+ },
239
+ {
240
+ "epoch": 10.43,
241
+ "learning_rate": 4.106280193236715e-05,
242
+ "loss": 0.626,
243
+ "step": 240
244
+ },
245
+ {
246
+ "epoch": 10.87,
247
+ "learning_rate": 4.045893719806764e-05,
248
+ "loss": 0.6522,
249
+ "step": 250
250
+ },
251
+ {
252
+ "epoch": 11.0,
253
+ "eval_accuracy": 0.7909090909090909,
254
+ "eval_loss": 0.5725868344306946,
255
+ "eval_runtime": 0.7147,
256
+ "eval_samples_per_second": 153.916,
257
+ "eval_steps_per_second": 5.597,
258
+ "step": 253
259
+ },
260
+ {
261
+ "epoch": 11.3,
262
+ "learning_rate": 3.985507246376812e-05,
263
+ "loss": 0.5859,
264
+ "step": 260
265
+ },
266
+ {
267
+ "epoch": 11.74,
268
+ "learning_rate": 3.92512077294686e-05,
269
+ "loss": 0.5659,
270
+ "step": 270
271
+ },
272
+ {
273
+ "epoch": 12.0,
274
+ "eval_accuracy": 0.7727272727272727,
275
+ "eval_loss": 0.5489825010299683,
276
+ "eval_runtime": 0.6332,
277
+ "eval_samples_per_second": 173.73,
278
+ "eval_steps_per_second": 6.317,
279
+ "step": 276
280
+ },
281
+ {
282
+ "epoch": 12.17,
283
+ "learning_rate": 3.864734299516908e-05,
284
+ "loss": 0.556,
285
+ "step": 280
286
+ },
287
+ {
288
+ "epoch": 12.61,
289
+ "learning_rate": 3.804347826086957e-05,
290
+ "loss": 0.5511,
291
+ "step": 290
292
+ },
293
+ {
294
+ "epoch": 13.0,
295
+ "eval_accuracy": 0.8,
296
+ "eval_loss": 0.546451210975647,
297
+ "eval_runtime": 0.6251,
298
+ "eval_samples_per_second": 175.96,
299
+ "eval_steps_per_second": 6.399,
300
+ "step": 299
301
+ },
302
+ {
303
+ "epoch": 13.04,
304
+ "learning_rate": 3.743961352657005e-05,
305
+ "loss": 0.5614,
306
+ "step": 300
307
+ },
308
+ {
309
+ "epoch": 13.48,
310
+ "learning_rate": 3.6835748792270534e-05,
311
+ "loss": 0.5552,
312
+ "step": 310
313
+ },
314
+ {
315
+ "epoch": 13.91,
316
+ "learning_rate": 3.6231884057971014e-05,
317
+ "loss": 0.5435,
318
+ "step": 320
319
+ },
320
+ {
321
+ "epoch": 14.0,
322
+ "eval_accuracy": 0.7909090909090909,
323
+ "eval_loss": 0.5727524757385254,
324
+ "eval_runtime": 0.6272,
325
+ "eval_samples_per_second": 175.393,
326
+ "eval_steps_per_second": 6.378,
327
+ "step": 322
328
+ },
329
+ {
330
+ "epoch": 14.35,
331
+ "learning_rate": 3.56280193236715e-05,
332
+ "loss": 0.5447,
333
+ "step": 330
334
+ },
335
+ {
336
+ "epoch": 14.78,
337
+ "learning_rate": 3.502415458937198e-05,
338
+ "loss": 0.5259,
339
+ "step": 340
340
+ },
341
+ {
342
+ "epoch": 15.0,
343
+ "eval_accuracy": 0.7636363636363637,
344
+ "eval_loss": 0.6047121286392212,
345
+ "eval_runtime": 0.6091,
346
+ "eval_samples_per_second": 180.583,
347
+ "eval_steps_per_second": 6.567,
348
+ "step": 345
349
+ },
350
+ {
351
+ "epoch": 15.22,
352
+ "learning_rate": 3.4420289855072465e-05,
353
+ "loss": 0.507,
354
+ "step": 350
355
+ },
356
+ {
357
+ "epoch": 15.65,
358
+ "learning_rate": 3.381642512077295e-05,
359
+ "loss": 0.5496,
360
+ "step": 360
361
+ },
362
+ {
363
+ "epoch": 16.0,
364
+ "eval_accuracy": 0.7818181818181819,
365
+ "eval_loss": 0.6479418873786926,
366
+ "eval_runtime": 0.6141,
367
+ "eval_samples_per_second": 179.113,
368
+ "eval_steps_per_second": 6.513,
369
+ "step": 368
370
+ },
371
+ {
372
+ "epoch": 16.09,
373
+ "learning_rate": 3.321256038647343e-05,
374
+ "loss": 0.5197,
375
+ "step": 370
376
+ },
377
+ {
378
+ "epoch": 16.52,
379
+ "learning_rate": 3.260869565217392e-05,
380
+ "loss": 0.4831,
381
+ "step": 380
382
+ },
383
+ {
384
+ "epoch": 16.96,
385
+ "learning_rate": 3.2004830917874396e-05,
386
+ "loss": 0.543,
387
+ "step": 390
388
+ },
389
+ {
390
+ "epoch": 17.0,
391
+ "eval_accuracy": 0.7727272727272727,
392
+ "eval_loss": 0.6039574146270752,
393
+ "eval_runtime": 0.6076,
394
+ "eval_samples_per_second": 181.028,
395
+ "eval_steps_per_second": 6.583,
396
+ "step": 391
397
+ },
398
+ {
399
+ "epoch": 17.39,
400
+ "learning_rate": 3.140096618357488e-05,
401
+ "loss": 0.4882,
402
+ "step": 400
403
+ },
404
+ {
405
+ "epoch": 17.83,
406
+ "learning_rate": 3.079710144927536e-05,
407
+ "loss": 0.4646,
408
+ "step": 410
409
+ },
410
+ {
411
+ "epoch": 18.0,
412
+ "eval_accuracy": 0.7818181818181819,
413
+ "eval_loss": 0.6269252896308899,
414
+ "eval_runtime": 0.6351,
415
+ "eval_samples_per_second": 173.193,
416
+ "eval_steps_per_second": 6.298,
417
+ "step": 414
418
+ },
419
+ {
420
+ "epoch": 18.26,
421
+ "learning_rate": 3.0193236714975848e-05,
422
+ "loss": 0.4597,
423
+ "step": 420
424
+ },
425
+ {
426
+ "epoch": 18.7,
427
+ "learning_rate": 2.9589371980676327e-05,
428
+ "loss": 0.4867,
429
+ "step": 430
430
+ },
431
+ {
432
+ "epoch": 19.0,
433
+ "eval_accuracy": 0.7909090909090909,
434
+ "eval_loss": 0.6535181403160095,
435
+ "eval_runtime": 0.6591,
436
+ "eval_samples_per_second": 166.882,
437
+ "eval_steps_per_second": 6.068,
438
+ "step": 437
439
+ },
440
+ {
441
+ "epoch": 19.13,
442
+ "learning_rate": 2.8985507246376814e-05,
443
+ "loss": 0.4751,
444
+ "step": 440
445
+ },
446
+ {
447
+ "epoch": 19.57,
448
+ "learning_rate": 2.8381642512077293e-05,
449
+ "loss": 0.4354,
450
+ "step": 450
451
+ },
452
+ {
453
+ "epoch": 20.0,
454
+ "learning_rate": 2.777777777777778e-05,
455
+ "loss": 0.4357,
456
+ "step": 460
457
+ },
458
+ {
459
+ "epoch": 20.0,
460
+ "eval_accuracy": 0.7727272727272727,
461
+ "eval_loss": 0.6990672945976257,
462
+ "eval_runtime": 0.6056,
463
+ "eval_samples_per_second": 181.624,
464
+ "eval_steps_per_second": 6.605,
465
+ "step": 460
466
+ },
467
+ {
468
+ "epoch": 20.43,
469
+ "learning_rate": 2.7173913043478262e-05,
470
+ "loss": 0.4275,
471
+ "step": 470
472
+ },
473
+ {
474
+ "epoch": 20.87,
475
+ "learning_rate": 2.6570048309178748e-05,
476
+ "loss": 0.4392,
477
+ "step": 480
478
+ },
479
+ {
480
+ "epoch": 21.0,
481
+ "eval_accuracy": 0.7636363636363637,
482
+ "eval_loss": 0.7126674056053162,
483
+ "eval_runtime": 0.6271,
484
+ "eval_samples_per_second": 175.399,
485
+ "eval_steps_per_second": 6.378,
486
+ "step": 483
487
+ },
488
+ {
489
+ "epoch": 21.3,
490
+ "learning_rate": 2.5966183574879227e-05,
491
+ "loss": 0.4595,
492
+ "step": 490
493
+ },
494
+ {
495
+ "epoch": 21.74,
496
+ "learning_rate": 2.5362318840579714e-05,
497
+ "loss": 0.4403,
498
+ "step": 500
499
+ },
500
+ {
501
+ "epoch": 22.0,
502
+ "eval_accuracy": 0.8090909090909091,
503
+ "eval_loss": 0.6974316239356995,
504
+ "eval_runtime": 0.6812,
505
+ "eval_samples_per_second": 161.491,
506
+ "eval_steps_per_second": 5.872,
507
+ "step": 506
508
+ },
509
+ {
510
+ "epoch": 22.17,
511
+ "learning_rate": 2.4758454106280193e-05,
512
+ "loss": 0.4305,
513
+ "step": 510
514
+ },
515
+ {
516
+ "epoch": 22.61,
517
+ "learning_rate": 2.4154589371980676e-05,
518
+ "loss": 0.4358,
519
+ "step": 520
520
+ },
521
+ {
522
+ "epoch": 23.0,
523
+ "eval_accuracy": 0.7818181818181819,
524
+ "eval_loss": 0.688274085521698,
525
+ "eval_runtime": 0.6268,
526
+ "eval_samples_per_second": 175.487,
527
+ "eval_steps_per_second": 6.381,
528
+ "step": 529
529
+ },
530
+ {
531
+ "epoch": 23.04,
532
+ "learning_rate": 2.355072463768116e-05,
533
+ "loss": 0.4054,
534
+ "step": 530
535
+ },
536
+ {
537
+ "epoch": 23.48,
538
+ "learning_rate": 2.294685990338164e-05,
539
+ "loss": 0.4162,
540
+ "step": 540
541
+ },
542
+ {
543
+ "epoch": 23.91,
544
+ "learning_rate": 2.2342995169082127e-05,
545
+ "loss": 0.4094,
546
+ "step": 550
547
+ },
548
+ {
549
+ "epoch": 24.0,
550
+ "eval_accuracy": 0.8,
551
+ "eval_loss": 0.6768017411231995,
552
+ "eval_runtime": 0.6762,
553
+ "eval_samples_per_second": 162.682,
554
+ "eval_steps_per_second": 5.916,
555
+ "step": 552
556
+ },
557
+ {
558
+ "epoch": 24.35,
559
+ "learning_rate": 2.173913043478261e-05,
560
+ "loss": 0.3892,
561
+ "step": 560
562
+ },
563
+ {
564
+ "epoch": 24.78,
565
+ "learning_rate": 2.1135265700483093e-05,
566
+ "loss": 0.3913,
567
+ "step": 570
568
+ },
569
+ {
570
+ "epoch": 25.0,
571
+ "eval_accuracy": 0.7636363636363637,
572
+ "eval_loss": 0.7269611954689026,
573
+ "eval_runtime": 0.6222,
574
+ "eval_samples_per_second": 176.803,
575
+ "eval_steps_per_second": 6.429,
576
+ "step": 575
577
+ },
578
+ {
579
+ "epoch": 25.22,
580
+ "learning_rate": 2.0531400966183576e-05,
581
+ "loss": 0.3916,
582
+ "step": 580
583
+ },
584
+ {
585
+ "epoch": 25.65,
586
+ "learning_rate": 1.992753623188406e-05,
587
+ "loss": 0.3686,
588
+ "step": 590
589
+ },
590
+ {
591
+ "epoch": 26.0,
592
+ "eval_accuracy": 0.7727272727272727,
593
+ "eval_loss": 0.7104293704032898,
594
+ "eval_runtime": 0.6272,
595
+ "eval_samples_per_second": 175.395,
596
+ "eval_steps_per_second": 6.378,
597
+ "step": 598
598
+ },
599
+ {
600
+ "epoch": 26.09,
601
+ "learning_rate": 1.932367149758454e-05,
602
+ "loss": 0.4003,
603
+ "step": 600
604
+ },
605
+ {
606
+ "epoch": 26.52,
607
+ "learning_rate": 1.8719806763285024e-05,
608
+ "loss": 0.3857,
609
+ "step": 610
610
+ },
611
+ {
612
+ "epoch": 26.96,
613
+ "learning_rate": 1.8115942028985507e-05,
614
+ "loss": 0.3679,
615
+ "step": 620
616
+ },
617
+ {
618
+ "epoch": 27.0,
619
+ "eval_accuracy": 0.7818181818181819,
620
+ "eval_loss": 0.7115088701248169,
621
+ "eval_runtime": 0.6281,
622
+ "eval_samples_per_second": 175.12,
623
+ "eval_steps_per_second": 6.368,
624
+ "step": 621
625
+ },
626
+ {
627
+ "epoch": 27.39,
628
+ "learning_rate": 1.751207729468599e-05,
629
+ "loss": 0.3723,
630
+ "step": 630
631
+ },
632
+ {
633
+ "epoch": 27.83,
634
+ "learning_rate": 1.6908212560386476e-05,
635
+ "loss": 0.378,
636
+ "step": 640
637
+ },
638
+ {
639
+ "epoch": 28.0,
640
+ "eval_accuracy": 0.8090909090909091,
641
+ "eval_loss": 0.8020210862159729,
642
+ "eval_runtime": 0.6672,
643
+ "eval_samples_per_second": 164.88,
644
+ "eval_steps_per_second": 5.996,
645
+ "step": 644
646
+ },
647
+ {
648
+ "epoch": 28.26,
649
+ "learning_rate": 1.630434782608696e-05,
650
+ "loss": 0.3979,
651
+ "step": 650
652
+ },
653
+ {
654
+ "epoch": 28.7,
655
+ "learning_rate": 1.570048309178744e-05,
656
+ "loss": 0.3583,
657
+ "step": 660
658
+ },
659
+ {
660
+ "epoch": 29.0,
661
+ "eval_accuracy": 0.7909090909090909,
662
+ "eval_loss": 0.7524499893188477,
663
+ "eval_runtime": 0.6692,
664
+ "eval_samples_per_second": 164.387,
665
+ "eval_steps_per_second": 5.978,
666
+ "step": 667
667
+ },
668
+ {
669
+ "epoch": 29.13,
670
+ "learning_rate": 1.5096618357487924e-05,
671
+ "loss": 0.3708,
672
+ "step": 670
673
+ },
674
+ {
675
+ "epoch": 29.57,
676
+ "learning_rate": 1.4492753623188407e-05,
677
+ "loss": 0.3351,
678
+ "step": 680
679
+ },
680
+ {
681
+ "epoch": 30.0,
682
+ "learning_rate": 1.388888888888889e-05,
683
+ "loss": 0.3299,
684
+ "step": 690
685
+ },
686
+ {
687
+ "epoch": 30.0,
688
+ "eval_accuracy": 0.7909090909090909,
689
+ "eval_loss": 0.7783340215682983,
690
+ "eval_runtime": 0.6563,
691
+ "eval_samples_per_second": 167.609,
692
+ "eval_steps_per_second": 6.095,
693
+ "step": 690
694
+ },
695
+ {
696
+ "epoch": 30.43,
697
+ "learning_rate": 1.3285024154589374e-05,
698
+ "loss": 0.3476,
699
+ "step": 700
700
+ },
701
+ {
702
+ "epoch": 30.87,
703
+ "learning_rate": 1.2681159420289857e-05,
704
+ "loss": 0.3672,
705
+ "step": 710
706
+ },
707
+ {
708
+ "epoch": 31.0,
709
+ "eval_accuracy": 0.7909090909090909,
710
+ "eval_loss": 0.8193163871765137,
711
+ "eval_runtime": 0.6541,
712
+ "eval_samples_per_second": 168.158,
713
+ "eval_steps_per_second": 6.115,
714
+ "step": 713
715
+ },
716
+ {
717
+ "epoch": 31.3,
718
+ "learning_rate": 1.2077294685990338e-05,
719
+ "loss": 0.3257,
720
+ "step": 720
721
+ },
722
+ {
723
+ "epoch": 31.74,
724
+ "learning_rate": 1.147342995169082e-05,
725
+ "loss": 0.3567,
726
+ "step": 730
727
+ },
728
+ {
729
+ "epoch": 32.0,
730
+ "eval_accuracy": 0.7909090909090909,
731
+ "eval_loss": 0.809545636177063,
732
+ "eval_runtime": 0.6397,
733
+ "eval_samples_per_second": 171.969,
734
+ "eval_steps_per_second": 6.253,
735
+ "step": 736
736
+ },
737
+ {
738
+ "epoch": 32.17,
739
+ "learning_rate": 1.0869565217391305e-05,
740
+ "loss": 0.32,
741
+ "step": 740
742
+ },
743
+ {
744
+ "epoch": 32.61,
745
+ "learning_rate": 1.0265700483091788e-05,
746
+ "loss": 0.3585,
747
+ "step": 750
748
+ },
749
+ {
750
+ "epoch": 33.0,
751
+ "eval_accuracy": 0.7909090909090909,
752
+ "eval_loss": 0.8323901295661926,
753
+ "eval_runtime": 0.6281,
754
+ "eval_samples_per_second": 175.12,
755
+ "eval_steps_per_second": 6.368,
756
+ "step": 759
757
+ },
758
+ {
759
+ "epoch": 33.04,
760
+ "learning_rate": 9.66183574879227e-06,
761
+ "loss": 0.3212,
762
+ "step": 760
763
+ },
764
+ {
765
+ "epoch": 33.48,
766
+ "learning_rate": 9.057971014492753e-06,
767
+ "loss": 0.3189,
768
+ "step": 770
769
+ },
770
+ {
771
+ "epoch": 33.91,
772
+ "learning_rate": 8.454106280193238e-06,
773
+ "loss": 0.3191,
774
+ "step": 780
775
+ },
776
+ {
777
+ "epoch": 34.0,
778
+ "eval_accuracy": 0.7909090909090909,
779
+ "eval_loss": 0.8041682243347168,
780
+ "eval_runtime": 0.6299,
781
+ "eval_samples_per_second": 174.635,
782
+ "eval_steps_per_second": 6.35,
783
+ "step": 782
784
+ },
785
+ {
786
+ "epoch": 34.35,
787
+ "learning_rate": 7.85024154589372e-06,
788
+ "loss": 0.3019,
789
+ "step": 790
790
+ },
791
+ {
792
+ "epoch": 34.78,
793
+ "learning_rate": 7.246376811594203e-06,
794
+ "loss": 0.3144,
795
+ "step": 800
796
+ },
797
+ {
798
+ "epoch": 35.0,
799
+ "eval_accuracy": 0.7909090909090909,
800
+ "eval_loss": 0.8189137578010559,
801
+ "eval_runtime": 0.6467,
802
+ "eval_samples_per_second": 170.106,
803
+ "eval_steps_per_second": 6.186,
804
+ "step": 805
805
+ },
806
+ {
807
+ "epoch": 35.22,
808
+ "learning_rate": 6.642512077294687e-06,
809
+ "loss": 0.333,
810
+ "step": 810
811
+ },
812
+ {
813
+ "epoch": 35.65,
814
+ "learning_rate": 6.038647342995169e-06,
815
+ "loss": 0.3452,
816
+ "step": 820
817
+ },
818
+ {
819
+ "epoch": 36.0,
820
+ "eval_accuracy": 0.7909090909090909,
821
+ "eval_loss": 0.8377164006233215,
822
+ "eval_runtime": 0.6036,
823
+ "eval_samples_per_second": 182.227,
824
+ "eval_steps_per_second": 6.626,
825
+ "step": 828
826
+ },
827
+ {
828
+ "epoch": 36.09,
829
+ "learning_rate": 5.4347826086956525e-06,
830
+ "loss": 0.2989,
831
+ "step": 830
832
+ },
833
+ {
834
+ "epoch": 36.52,
835
+ "learning_rate": 4.830917874396135e-06,
836
+ "loss": 0.2819,
837
+ "step": 840
838
+ },
839
+ {
840
+ "epoch": 36.96,
841
+ "learning_rate": 4.227053140096619e-06,
842
+ "loss": 0.3263,
843
+ "step": 850
844
+ },
845
+ {
846
+ "epoch": 37.0,
847
+ "eval_accuracy": 0.7909090909090909,
848
+ "eval_loss": 0.8204471468925476,
849
+ "eval_runtime": 0.6006,
850
+ "eval_samples_per_second": 183.137,
851
+ "eval_steps_per_second": 6.66,
852
+ "step": 851
853
+ },
854
+ {
855
+ "epoch": 37.39,
856
+ "learning_rate": 3.6231884057971017e-06,
857
+ "loss": 0.3016,
858
+ "step": 860
859
+ },
860
+ {
861
+ "epoch": 37.83,
862
+ "learning_rate": 3.0193236714975845e-06,
863
+ "loss": 0.2939,
864
+ "step": 870
865
+ },
866
+ {
867
+ "epoch": 38.0,
868
+ "eval_accuracy": 0.7909090909090909,
869
+ "eval_loss": 0.810295581817627,
870
+ "eval_runtime": 0.6091,
871
+ "eval_samples_per_second": 180.583,
872
+ "eval_steps_per_second": 6.567,
873
+ "step": 874
874
+ },
875
+ {
876
+ "epoch": 38.26,
877
+ "learning_rate": 2.4154589371980677e-06,
878
+ "loss": 0.2872,
879
+ "step": 880
880
+ },
881
+ {
882
+ "epoch": 38.7,
883
+ "learning_rate": 1.8115942028985508e-06,
884
+ "loss": 0.3152,
885
+ "step": 890
886
+ },
887
+ {
888
+ "epoch": 39.0,
889
+ "eval_accuracy": 0.7818181818181819,
890
+ "eval_loss": 0.8183740973472595,
891
+ "eval_runtime": 0.6091,
892
+ "eval_samples_per_second": 180.583,
893
+ "eval_steps_per_second": 6.567,
894
+ "step": 897
895
+ },
896
+ {
897
+ "epoch": 39.13,
898
+ "learning_rate": 1.2077294685990338e-06,
899
+ "loss": 0.3059,
900
+ "step": 900
901
+ },
902
+ {
903
+ "epoch": 39.57,
904
+ "learning_rate": 6.038647342995169e-07,
905
+ "loss": 0.3041,
906
+ "step": 910
907
+ },
908
+ {
909
+ "epoch": 40.0,
910
+ "learning_rate": 0.0,
911
+ "loss": 0.2787,
912
+ "step": 920
913
+ },
914
+ {
915
+ "epoch": 40.0,
916
+ "eval_accuracy": 0.7818181818181819,
917
+ "eval_loss": 0.8240975141525269,
918
+ "eval_runtime": 0.6032,
919
+ "eval_samples_per_second": 182.376,
920
+ "eval_steps_per_second": 6.632,
921
+ "step": 920
922
+ },
923
+ {
924
+ "epoch": 40.0,
925
+ "step": 920,
926
+ "total_flos": 3.825055592868741e+18,
927
+ "train_loss": 0.5426292188789533,
928
+ "train_runtime": 1410.8495,
929
+ "train_samples_per_second": 83.326,
930
+ "train_steps_per_second": 0.652
931
+ }
932
+ ],
933
+ "logging_steps": 10,
934
+ "max_steps": 920,
935
+ "num_input_tokens_seen": 0,
936
+ "num_train_epochs": 40,
937
+ "save_steps": 500,
938
+ "total_flos": 3.825055592868741e+18,
939
+ "train_batch_size": 32,
940
+ "trial_name": null,
941
+ "trial_params": null
942
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b782ec53752d361856b7ec65955590b0c23d3b67232bb52a728031383b2985d8
3
+ size 4792