Atishjn committed on
Commit
a4f5eaf
·
verified ·
1 Parent(s): e3f7b87

Initial model upload

Browse files
README.md ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ license: apache-2.0
4
+ base_model: sentence-transformers/all-mpnet-base-v2
5
+ tags:
6
+ - generated_from_trainer
7
+ metrics:
8
+ - accuracy
9
+ - f1
10
+ - precision
11
+ - recall
12
+ model-index:
13
+ - name: classify-bluesky-1000-v2
14
+ results: []
15
+ ---
16
+
17
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
18
+ should probably proofread and complete it, then remove this comment. -->
19
+
20
+ # classify-bluesky-1000-v2
21
+
22
+ This model is a fine-tuned version of [sentence-transformers/all-mpnet-base-v2](https://huggingface.co/sentence-transformers/all-mpnet-base-v2) on an unknown dataset.
23
+ It achieves the following results on the evaluation set:
24
+ - Loss: 0.0055
25
+ - Accuracy: 0.9994
26
+ - F1: 0.9994
27
+ - Precision: 0.9994
28
+ - Recall: 0.9994
29
+ - Accuracy Label Bluesky: 1.0
30
+ - Accuracy Label Non bluesky: 0.9992
31
+
32
+ ## Model description
33
+
34
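+ An MPNet sequence-classification model (`MPNetForSequenceClassification`, single-label) fine-tuned from [sentence-transformers/all-mpnet-base-v2](https://huggingface.co/sentence-transformers/all-mpnet-base-v2) to classify text into two labels, "Bluesky" and "Non bluesky". More information needed on how the labels are defined.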
35
+
36
+ ## Intended uses & limitations
37
+
38
+ More information needed
39
+
40
+ ## Training and evaluation data
41
+
42
+ More information needed
43
+
44
+ ## Training procedure
45
+
46
+ ### Training hyperparameters
47
+
48
+ The following hyperparameters were used during training:
49
+ - learning_rate: 2e-05
50
+ - train_batch_size: 16
51
+ - eval_batch_size: 16
52
+ - seed: 42
53
+ - gradient_accumulation_steps: 2
54
+ - total_train_batch_size: 32
55
+ - optimizer: AdamW (OptimizerNames.ADAMW_TORCH) with betas=(0.9, 0.999) and epsilon=1e-08; no additional optimizer arguments
56
+ - lr_scheduler_type: linear
57
+ - lr_scheduler_warmup_steps: 500
58
+ - num_epochs: 3
59
+
60
+ ### Training results
61
+
62
+ | Training Loss | Epoch | Step | Validation Loss | Accuracy | F1 | Precision | Recall | Accuracy Label Bluesky | Accuracy Label Non bluesky |
63
+ |:-------------:|:------:|:----:|:---------------:|:--------:|:------:|:---------:|:------:|:----------------------:|:--------------------------:|
64
+ | 0.514 | 0.4292 | 100 | 0.4462 | 0.8747 | 0.8577 | 0.8925 | 0.8747 | 0.4832 | 1.0 |
65
+ | 0.095 | 0.8584 | 200 | 0.0717 | 0.9984 | 0.9984 | 0.9984 | 0.9984 | 0.9948 | 0.9996 |
66
+ | 0.031 | 1.2876 | 300 | 0.0230 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 |
67
+ | 0.0139 | 1.7167 | 400 | 0.0099 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 |
68
+ | 0.007 | 2.1459 | 500 | 0.0259 | 0.9947 | 0.9947 | 0.9948 | 0.9947 | 1.0 | 0.9930 |
69
+ | 0.0045 | 2.5751 | 600 | 0.0060 | 0.9994 | 0.9994 | 0.9994 | 0.9994 | 1.0 | 0.9992 |
70
+
71
+
72
+ ### Framework versions
73
+
74
+ - Transformers 4.49.0
75
+ - Pytorch 2.5.1+cu124
76
+ - Datasets 3.3.2
77
+ - Tokenizers 0.21.0
config.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "sentence-transformers/all-mpnet-base-v2",
3
+ "architectures": [
4
+ "MPNetForSequenceClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "bos_token_id": 0,
8
+ "eos_token_id": 2,
9
+ "hidden_act": "gelu",
10
+ "hidden_dropout_prob": 0.1,
11
+ "hidden_size": 768,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 3072,
14
+ "layer_norm_eps": 1e-05,
15
+ "max_position_embeddings": 514,
16
+ "model_type": "mpnet",
17
+ "num_attention_heads": 12,
18
+ "num_hidden_layers": 12,
19
+ "pad_token_id": 1,
20
+ "problem_type": "single_label_classification",
21
+ "relative_attention_num_buckets": 32,
22
+ "torch_dtype": "float32",
23
+ "transformers_version": "4.49.0",
24
+ "vocab_size": 30527
25
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:68212d994b81d6514eb1e0b7e7adb59a315c726f03b3c6eed3b61dda64620ea9
3
+ size 437975200
runs/Mar04_03-47-59_5092f3fe7514/events.out.tfevents.1741060091.5092f3fe7514.891.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3d7c6b641cbfceee73ea680c0662197550a0d1d6cad005fc9d948059fbfd4e2f
3
+ size 5057
runs/Mar04_03-55-40_5092f3fe7514/events.out.tfevents.1741060552.5092f3fe7514.891.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:393b8a1b478a634d4576a43810e17dcbee299a72f16d086744d9c8d018b2a56b
3
+ size 23559
runs/Mar04_03-55-40_5092f3fe7514/events.out.tfevents.1741061250.5092f3fe7514.891.2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aa18207fbeca21c79f1ed51a3d30691e5959734e4b3c742285452f5a3d960836
3
+ size 696
special_tokens_map.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "cls_token": {
10
+ "content": "<s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "eos_token": {
17
+ "content": "</s>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "mask_token": {
24
+ "content": "<mask>",
25
+ "lstrip": true,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "pad_token": {
31
+ "content": "<pad>",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ },
37
+ "sep_token": {
38
+ "content": "</s>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false
43
+ },
44
+ "unk_token": {
45
+ "content": "[UNK]",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false
50
+ }
51
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "<s>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "<pad>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "</s>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "<unk>",
29
+ "lstrip": false,
30
+ "normalized": true,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "104": {
36
+ "content": "[UNK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ },
43
+ "30526": {
44
+ "content": "<mask>",
45
+ "lstrip": true,
46
+ "normalized": false,
47
+ "rstrip": false,
48
+ "single_word": false,
49
+ "special": true
50
+ }
51
+ },
52
+ "bos_token": "<s>",
53
+ "clean_up_tokenization_spaces": false,
54
+ "cls_token": "<s>",
55
+ "do_lower_case": true,
56
+ "eos_token": "</s>",
57
+ "extra_special_tokens": {},
58
+ "mask_token": "<mask>",
59
+ "max_length": 128,
60
+ "model_max_length": 512,
61
+ "pad_to_multiple_of": null,
62
+ "pad_token": "<pad>",
63
+ "pad_token_type_id": 0,
64
+ "padding_side": "right",
65
+ "sep_token": "</s>",
66
+ "stride": 0,
67
+ "strip_accents": null,
68
+ "tokenize_chinese_chars": true,
69
+ "tokenizer_class": "MPNetTokenizer",
70
+ "truncation_side": "right",
71
+ "truncation_strategy": "longest_first",
72
+ "unk_token": "[UNK]"
73
+ }
trainer_state.json ADDED
@@ -0,0 +1,623 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 3.0,
5
+ "eval_steps": 100,
6
+ "global_step": 699,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.04291845493562232,
13
+ "grad_norm": 0.8440808057785034,
14
+ "learning_rate": 4.0000000000000003e-07,
15
+ "loss": 0.6863,
16
+ "step": 10
17
+ },
18
+ {
19
+ "epoch": 0.08583690987124463,
20
+ "grad_norm": 0.7271698117256165,
21
+ "learning_rate": 8.000000000000001e-07,
22
+ "loss": 0.6822,
23
+ "step": 20
24
+ },
25
+ {
26
+ "epoch": 0.12875536480686695,
27
+ "grad_norm": 0.772187352180481,
28
+ "learning_rate": 1.2000000000000002e-06,
29
+ "loss": 0.6756,
30
+ "step": 30
31
+ },
32
+ {
33
+ "epoch": 0.17167381974248927,
34
+ "grad_norm": 0.8175302743911743,
35
+ "learning_rate": 1.6000000000000001e-06,
36
+ "loss": 0.6596,
37
+ "step": 40
38
+ },
39
+ {
40
+ "epoch": 0.2145922746781116,
41
+ "grad_norm": 0.6373170614242554,
42
+ "learning_rate": 2.0000000000000003e-06,
43
+ "loss": 0.6501,
44
+ "step": 50
45
+ },
46
+ {
47
+ "epoch": 0.2575107296137339,
48
+ "grad_norm": 0.6438242197036743,
49
+ "learning_rate": 2.4000000000000003e-06,
50
+ "loss": 0.6365,
51
+ "step": 60
52
+ },
53
+ {
54
+ "epoch": 0.30042918454935624,
55
+ "grad_norm": 1.0084680318832397,
56
+ "learning_rate": 2.8000000000000003e-06,
57
+ "loss": 0.6066,
58
+ "step": 70
59
+ },
60
+ {
61
+ "epoch": 0.34334763948497854,
62
+ "grad_norm": 0.736494779586792,
63
+ "learning_rate": 3.2000000000000003e-06,
64
+ "loss": 0.5716,
65
+ "step": 80
66
+ },
67
+ {
68
+ "epoch": 0.38626609442060084,
69
+ "grad_norm": 0.7631027698516846,
70
+ "learning_rate": 3.6000000000000003e-06,
71
+ "loss": 0.5571,
72
+ "step": 90
73
+ },
74
+ {
75
+ "epoch": 0.4291845493562232,
76
+ "grad_norm": 1.0123592615127563,
77
+ "learning_rate": 4.000000000000001e-06,
78
+ "loss": 0.514,
79
+ "step": 100
80
+ },
81
+ {
82
+ "epoch": 0.4291845493562232,
83
+ "eval_accuracy": 0.8747259630441591,
84
+ "eval_accuracy_label_bluesky": 0.48320413436692505,
85
+ "eval_accuracy_label_non bluesky": 1.0,
86
+ "eval_f1": 0.8576844954005327,
87
+ "eval_loss": 0.4462388753890991,
88
+ "eval_precision": 0.8925016334174604,
89
+ "eval_recall": 0.8747259630441591,
90
+ "eval_runtime": 16.059,
91
+ "eval_samples_per_second": 198.83,
92
+ "eval_steps_per_second": 12.454,
93
+ "step": 100
94
+ },
95
+ {
96
+ "epoch": 0.4721030042918455,
97
+ "grad_norm": 0.9895055890083313,
98
+ "learning_rate": 4.4e-06,
99
+ "loss": 0.4513,
100
+ "step": 110
101
+ },
102
+ {
103
+ "epoch": 0.5150214592274678,
104
+ "grad_norm": 1.004581332206726,
105
+ "learning_rate": 4.800000000000001e-06,
106
+ "loss": 0.368,
107
+ "step": 120
108
+ },
109
+ {
110
+ "epoch": 0.5579399141630901,
111
+ "grad_norm": 1.0057802200317383,
112
+ "learning_rate": 5.2e-06,
113
+ "loss": 0.3251,
114
+ "step": 130
115
+ },
116
+ {
117
+ "epoch": 0.6008583690987125,
118
+ "grad_norm": 0.8673408627510071,
119
+ "learning_rate": 5.600000000000001e-06,
120
+ "loss": 0.2666,
121
+ "step": 140
122
+ },
123
+ {
124
+ "epoch": 0.6437768240343348,
125
+ "grad_norm": 0.8042262196540833,
126
+ "learning_rate": 6e-06,
127
+ "loss": 0.2042,
128
+ "step": 150
129
+ },
130
+ {
131
+ "epoch": 0.6866952789699571,
132
+ "grad_norm": 0.705209493637085,
133
+ "learning_rate": 6.4000000000000006e-06,
134
+ "loss": 0.1673,
135
+ "step": 160
136
+ },
137
+ {
138
+ "epoch": 0.7296137339055794,
139
+ "grad_norm": 0.6522665023803711,
140
+ "learning_rate": 6.800000000000001e-06,
141
+ "loss": 0.1469,
142
+ "step": 170
143
+ },
144
+ {
145
+ "epoch": 0.7725321888412017,
146
+ "grad_norm": 0.6036862134933472,
147
+ "learning_rate": 7.2000000000000005e-06,
148
+ "loss": 0.124,
149
+ "step": 180
150
+ },
151
+ {
152
+ "epoch": 0.8154506437768241,
153
+ "grad_norm": 0.5399958491325378,
154
+ "learning_rate": 7.600000000000001e-06,
155
+ "loss": 0.1105,
156
+ "step": 190
157
+ },
158
+ {
159
+ "epoch": 0.8583690987124464,
160
+ "grad_norm": 0.49989593029022217,
161
+ "learning_rate": 8.000000000000001e-06,
162
+ "loss": 0.095,
163
+ "step": 200
164
+ },
165
+ {
166
+ "epoch": 0.8583690987124464,
167
+ "eval_accuracy": 0.998434074538052,
168
+ "eval_accuracy_label_bluesky": 0.9948320413436692,
169
+ "eval_accuracy_label_non bluesky": 0.9995866060355518,
170
+ "eval_f1": 0.9984330394292978,
171
+ "eval_loss": 0.07170984894037247,
172
+ "eval_precision": 0.9984344076205467,
173
+ "eval_recall": 0.998434074538052,
174
+ "eval_runtime": 16.3727,
175
+ "eval_samples_per_second": 195.019,
176
+ "eval_steps_per_second": 12.215,
177
+ "step": 200
178
+ },
179
+ {
180
+ "epoch": 0.9012875536480687,
181
+ "grad_norm": 4.4411444664001465,
182
+ "learning_rate": 8.400000000000001e-06,
183
+ "loss": 0.0998,
184
+ "step": 210
185
+ },
186
+ {
187
+ "epoch": 0.944206008583691,
188
+ "grad_norm": 0.4215564727783203,
189
+ "learning_rate": 8.8e-06,
190
+ "loss": 0.0759,
191
+ "step": 220
192
+ },
193
+ {
194
+ "epoch": 0.9871244635193133,
195
+ "grad_norm": 0.38805362582206726,
196
+ "learning_rate": 9.200000000000002e-06,
197
+ "loss": 0.0651,
198
+ "step": 230
199
+ },
200
+ {
201
+ "epoch": 1.0300429184549356,
202
+ "grad_norm": 0.3541722595691681,
203
+ "learning_rate": 9.600000000000001e-06,
204
+ "loss": 0.058,
205
+ "step": 240
206
+ },
207
+ {
208
+ "epoch": 1.0729613733905579,
209
+ "grad_norm": 0.3281799554824829,
210
+ "learning_rate": 1e-05,
211
+ "loss": 0.0527,
212
+ "step": 250
213
+ },
214
+ {
215
+ "epoch": 1.1158798283261802,
216
+ "grad_norm": 0.3049239218235016,
217
+ "learning_rate": 1.04e-05,
218
+ "loss": 0.047,
219
+ "step": 260
220
+ },
221
+ {
222
+ "epoch": 1.1587982832618025,
223
+ "grad_norm": 0.27242380380630493,
224
+ "learning_rate": 1.0800000000000002e-05,
225
+ "loss": 0.042,
226
+ "step": 270
227
+ },
228
+ {
229
+ "epoch": 1.201716738197425,
230
+ "grad_norm": 0.26639366149902344,
231
+ "learning_rate": 1.1200000000000001e-05,
232
+ "loss": 0.0381,
233
+ "step": 280
234
+ },
235
+ {
236
+ "epoch": 1.2446351931330473,
237
+ "grad_norm": 0.23575416207313538,
238
+ "learning_rate": 1.16e-05,
239
+ "loss": 0.035,
240
+ "step": 290
241
+ },
242
+ {
243
+ "epoch": 1.2875536480686696,
244
+ "grad_norm": 0.2203730046749115,
245
+ "learning_rate": 1.2e-05,
246
+ "loss": 0.031,
247
+ "step": 300
248
+ },
249
+ {
250
+ "epoch": 1.2875536480686696,
251
+ "eval_accuracy": 1.0,
252
+ "eval_accuracy_label_bluesky": 1.0,
253
+ "eval_accuracy_label_non bluesky": 1.0,
254
+ "eval_f1": 1.0,
255
+ "eval_loss": 0.022965509444475174,
256
+ "eval_precision": 1.0,
257
+ "eval_recall": 1.0,
258
+ "eval_runtime": 16.4605,
259
+ "eval_samples_per_second": 193.979,
260
+ "eval_steps_per_second": 12.15,
261
+ "step": 300
262
+ },
263
+ {
264
+ "epoch": 1.3304721030042919,
265
+ "grad_norm": 0.2094569206237793,
266
+ "learning_rate": 1.2400000000000002e-05,
267
+ "loss": 0.0285,
268
+ "step": 310
269
+ },
270
+ {
271
+ "epoch": 1.3733905579399142,
272
+ "grad_norm": 0.18893343210220337,
273
+ "learning_rate": 1.2800000000000001e-05,
274
+ "loss": 0.0255,
275
+ "step": 320
276
+ },
277
+ {
278
+ "epoch": 1.4163090128755365,
279
+ "grad_norm": 0.1822691410779953,
280
+ "learning_rate": 1.3200000000000002e-05,
281
+ "loss": 0.0235,
282
+ "step": 330
283
+ },
284
+ {
285
+ "epoch": 1.4592274678111588,
286
+ "grad_norm": 0.2712848484516144,
287
+ "learning_rate": 1.3600000000000002e-05,
288
+ "loss": 0.0365,
289
+ "step": 340
290
+ },
291
+ {
292
+ "epoch": 1.5021459227467813,
293
+ "grad_norm": 27.6467227935791,
294
+ "learning_rate": 1.4e-05,
295
+ "loss": 0.0442,
296
+ "step": 350
297
+ },
298
+ {
299
+ "epoch": 1.5450643776824036,
300
+ "grad_norm": 0.15123893320560455,
301
+ "learning_rate": 1.4400000000000001e-05,
302
+ "loss": 0.0316,
303
+ "step": 360
304
+ },
305
+ {
306
+ "epoch": 1.5879828326180259,
307
+ "grad_norm": 0.14126092195510864,
308
+ "learning_rate": 1.48e-05,
309
+ "loss": 0.0174,
310
+ "step": 370
311
+ },
312
+ {
313
+ "epoch": 1.6309012875536482,
314
+ "grad_norm": 0.13003908097743988,
315
+ "learning_rate": 1.5200000000000002e-05,
316
+ "loss": 0.016,
317
+ "step": 380
318
+ },
319
+ {
320
+ "epoch": 1.6738197424892705,
321
+ "grad_norm": 0.12764938175678253,
322
+ "learning_rate": 1.5600000000000003e-05,
323
+ "loss": 0.015,
324
+ "step": 390
325
+ },
326
+ {
327
+ "epoch": 1.7167381974248928,
328
+ "grad_norm": 0.12266356498003006,
329
+ "learning_rate": 1.6000000000000003e-05,
330
+ "loss": 0.0139,
331
+ "step": 400
332
+ },
333
+ {
334
+ "epoch": 1.7167381974248928,
335
+ "eval_accuracy": 1.0,
336
+ "eval_accuracy_label_bluesky": 1.0,
337
+ "eval_accuracy_label_non bluesky": 1.0,
338
+ "eval_f1": 1.0,
339
+ "eval_loss": 0.009904815815389156,
340
+ "eval_precision": 1.0,
341
+ "eval_recall": 1.0,
342
+ "eval_runtime": 16.3409,
343
+ "eval_samples_per_second": 195.4,
344
+ "eval_steps_per_second": 12.239,
345
+ "step": 400
346
+ },
347
+ {
348
+ "epoch": 1.759656652360515,
349
+ "grad_norm": 0.10915858298540115,
350
+ "learning_rate": 1.64e-05,
351
+ "loss": 0.0126,
352
+ "step": 410
353
+ },
354
+ {
355
+ "epoch": 1.8025751072961373,
356
+ "grad_norm": 0.10050816088914871,
357
+ "learning_rate": 1.6800000000000002e-05,
358
+ "loss": 0.0123,
359
+ "step": 420
360
+ },
361
+ {
362
+ "epoch": 1.8454935622317596,
363
+ "grad_norm": 0.09750162065029144,
364
+ "learning_rate": 1.72e-05,
365
+ "loss": 0.0135,
366
+ "step": 430
367
+ },
368
+ {
369
+ "epoch": 1.888412017167382,
370
+ "grad_norm": 0.09050226211547852,
371
+ "learning_rate": 1.76e-05,
372
+ "loss": 0.0101,
373
+ "step": 440
374
+ },
375
+ {
376
+ "epoch": 1.9313304721030042,
377
+ "grad_norm": 0.08614258468151093,
378
+ "learning_rate": 1.8e-05,
379
+ "loss": 0.0203,
380
+ "step": 450
381
+ },
382
+ {
383
+ "epoch": 1.9742489270386265,
384
+ "grad_norm": 0.08039192855358124,
385
+ "learning_rate": 1.8400000000000003e-05,
386
+ "loss": 0.0109,
387
+ "step": 460
388
+ },
389
+ {
390
+ "epoch": 2.017167381974249,
391
+ "grad_norm": 0.07501527667045593,
392
+ "learning_rate": 1.88e-05,
393
+ "loss": 0.0084,
394
+ "step": 470
395
+ },
396
+ {
397
+ "epoch": 2.060085836909871,
398
+ "grad_norm": 31.131675720214844,
399
+ "learning_rate": 1.9200000000000003e-05,
400
+ "loss": 0.0134,
401
+ "step": 480
402
+ },
403
+ {
404
+ "epoch": 2.1030042918454934,
405
+ "grad_norm": 0.0719398781657219,
406
+ "learning_rate": 1.9600000000000002e-05,
407
+ "loss": 0.0311,
408
+ "step": 490
409
+ },
410
+ {
411
+ "epoch": 2.1459227467811157,
412
+ "grad_norm": 0.06952013075351715,
413
+ "learning_rate": 2e-05,
414
+ "loss": 0.007,
415
+ "step": 500
416
+ },
417
+ {
418
+ "epoch": 2.1459227467811157,
419
+ "eval_accuracy": 0.9946758534293768,
420
+ "eval_accuracy_label_bluesky": 1.0,
421
+ "eval_accuracy_label_non bluesky": 0.992972302604382,
422
+ "eval_f1": 0.994695383411754,
423
+ "eval_loss": 0.02586308866739273,
424
+ "eval_precision": 0.9947902788297568,
425
+ "eval_recall": 0.9946758534293768,
426
+ "eval_runtime": 16.3887,
427
+ "eval_samples_per_second": 194.83,
428
+ "eval_steps_per_second": 12.204,
429
+ "step": 500
430
+ },
431
+ {
432
+ "epoch": 2.188841201716738,
433
+ "grad_norm": 0.06342656165361404,
434
+ "learning_rate": 1.899497487437186e-05,
435
+ "loss": 0.0267,
436
+ "step": 510
437
+ },
438
+ {
439
+ "epoch": 2.2317596566523603,
440
+ "grad_norm": 0.05855906754732132,
441
+ "learning_rate": 1.798994974874372e-05,
442
+ "loss": 0.0062,
443
+ "step": 520
444
+ },
445
+ {
446
+ "epoch": 2.274678111587983,
447
+ "grad_norm": 0.05645829439163208,
448
+ "learning_rate": 1.698492462311558e-05,
449
+ "loss": 0.0058,
450
+ "step": 530
451
+ },
452
+ {
453
+ "epoch": 2.317596566523605,
454
+ "grad_norm": 0.05568385869264603,
455
+ "learning_rate": 1.5979899497487437e-05,
456
+ "loss": 0.0056,
457
+ "step": 540
458
+ },
459
+ {
460
+ "epoch": 2.3605150214592276,
461
+ "grad_norm": 0.051322419196367264,
462
+ "learning_rate": 1.4974874371859299e-05,
463
+ "loss": 0.0052,
464
+ "step": 550
465
+ },
466
+ {
467
+ "epoch": 2.40343347639485,
468
+ "grad_norm": 2.771406650543213,
469
+ "learning_rate": 1.3969849246231157e-05,
470
+ "loss": 0.0206,
471
+ "step": 560
472
+ },
473
+ {
474
+ "epoch": 2.4463519313304722,
475
+ "grad_norm": 0.04993780702352524,
476
+ "learning_rate": 1.2964824120603017e-05,
477
+ "loss": 0.0049,
478
+ "step": 570
479
+ },
480
+ {
481
+ "epoch": 2.4892703862660945,
482
+ "grad_norm": 0.04749375954270363,
483
+ "learning_rate": 1.1959798994974876e-05,
484
+ "loss": 0.0047,
485
+ "step": 580
486
+ },
487
+ {
488
+ "epoch": 2.532188841201717,
489
+ "grad_norm": 0.04730561375617981,
490
+ "learning_rate": 1.0954773869346736e-05,
491
+ "loss": 0.0045,
492
+ "step": 590
493
+ },
494
+ {
495
+ "epoch": 2.575107296137339,
496
+ "grad_norm": 0.048863768577575684,
497
+ "learning_rate": 9.949748743718594e-06,
498
+ "loss": 0.0045,
499
+ "step": 600
500
+ },
501
+ {
502
+ "epoch": 2.575107296137339,
503
+ "eval_accuracy": 0.9993736298152208,
504
+ "eval_accuracy_label_bluesky": 1.0,
505
+ "eval_accuracy_label_non bluesky": 0.9991732120711038,
506
+ "eval_f1": 0.9993739044026467,
507
+ "eval_loss": 0.006034924183040857,
508
+ "eval_precision": 0.999375244171367,
509
+ "eval_recall": 0.9993736298152208,
510
+ "eval_runtime": 16.3578,
511
+ "eval_samples_per_second": 195.197,
512
+ "eval_steps_per_second": 12.227,
513
+ "step": 600
514
+ },
515
+ {
516
+ "epoch": 2.6180257510729614,
517
+ "grad_norm": 0.045305851846933365,
518
+ "learning_rate": 8.944723618090452e-06,
519
+ "loss": 0.0043,
520
+ "step": 610
521
+ },
522
+ {
523
+ "epoch": 2.6609442060085837,
524
+ "grad_norm": 0.04384471848607063,
525
+ "learning_rate": 7.939698492462312e-06,
526
+ "loss": 0.0042,
527
+ "step": 620
528
+ },
529
+ {
530
+ "epoch": 2.703862660944206,
531
+ "grad_norm": 0.04265659675002098,
532
+ "learning_rate": 6.934673366834172e-06,
533
+ "loss": 0.0213,
534
+ "step": 630
535
+ },
536
+ {
537
+ "epoch": 2.7467811158798283,
538
+ "grad_norm": 0.04311639815568924,
539
+ "learning_rate": 5.9296482412060305e-06,
540
+ "loss": 0.0041,
541
+ "step": 640
542
+ },
543
+ {
544
+ "epoch": 2.7896995708154506,
545
+ "grad_norm": 0.04334357753396034,
546
+ "learning_rate": 4.92462311557789e-06,
547
+ "loss": 0.004,
548
+ "step": 650
549
+ },
550
+ {
551
+ "epoch": 2.832618025751073,
552
+ "grad_norm": 0.0423327311873436,
553
+ "learning_rate": 3.919597989949749e-06,
554
+ "loss": 0.004,
555
+ "step": 660
556
+ },
557
+ {
558
+ "epoch": 2.875536480686695,
559
+ "grad_norm": 0.040839340537786484,
560
+ "learning_rate": 2.914572864321608e-06,
561
+ "loss": 0.004,
562
+ "step": 670
563
+ },
564
+ {
565
+ "epoch": 2.9184549356223175,
566
+ "grad_norm": 0.043217360973358154,
567
+ "learning_rate": 1.9095477386934674e-06,
568
+ "loss": 0.0039,
569
+ "step": 680
570
+ },
571
+ {
572
+ "epoch": 2.96137339055794,
573
+ "grad_norm": 0.043392766267061234,
574
+ "learning_rate": 9.045226130653267e-07,
575
+ "loss": 0.0039,
576
+ "step": 690
577
+ },
578
+ {
579
+ "epoch": 3.0,
580
+ "step": 699,
581
+ "total_flos": 1249265679735120.0,
582
+ "train_loss": 0.137061902504409,
583
+ "train_runtime": 530.5883,
584
+ "train_samples_per_second": 42.123,
585
+ "train_steps_per_second": 1.317
586
+ },
587
+ {
588
+ "epoch": 3.0,
589
+ "eval_accuracy": 0.9993736298152208,
590
+ "eval_accuracy_label_bluesky": 1.0,
591
+ "eval_accuracy_label_non bluesky": 0.9991732120711038,
592
+ "eval_f1": 0.9993739044026467,
593
+ "eval_loss": 0.00547385448589921,
594
+ "eval_precision": 0.999375244171367,
595
+ "eval_recall": 0.9993736298152208,
596
+ "eval_runtime": 17.5335,
597
+ "eval_samples_per_second": 182.109,
598
+ "eval_steps_per_second": 11.407,
599
+ "step": 699
600
+ }
601
+ ],
602
+ "logging_steps": 10,
603
+ "max_steps": 699,
604
+ "num_input_tokens_seen": 0,
605
+ "num_train_epochs": 3,
606
+ "save_steps": 1000,
607
+ "stateful_callbacks": {
608
+ "TrainerControl": {
609
+ "args": {
610
+ "should_epoch_stop": false,
611
+ "should_evaluate": false,
612
+ "should_log": false,
613
+ "should_save": true,
614
+ "should_training_stop": true
615
+ },
616
+ "attributes": {}
617
+ }
618
+ },
619
+ "total_flos": 1249265679735120.0,
620
+ "train_batch_size": 16,
621
+ "trial_name": null,
622
+ "trial_params": null
623
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4923d3eaab1100f1b1c385a6806e9b522aaee02bf3b46d3221104ac9f4954f3f
3
+ size 5304
vocab.txt ADDED
The diff for this file is too large to render. See raw diff