corranm committed on
Commit
428ed53
·
verified ·
1 Parent(s): ee0c72e

End of training

Browse files
README.md ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ license: apache-2.0
4
+ base_model: google/vit-base-patch16-224
5
+ tags:
6
+ - generated_from_trainer
7
+ metrics:
8
+ - accuracy
9
+ model-index:
10
+ - name: square_run_32_batch
11
+ results: []
12
+ ---
13
+
14
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
15
+ should probably proofread and complete it, then remove this comment. -->
16
+
17
+ # square_run_32_batch
18
+
19
+ This model is a fine-tuned version of [google/vit-base-patch16-224](https://huggingface.co/google/vit-base-patch16-224) on an unknown dataset.
20
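+ It achieves the following results on the evaluation set (from the final evaluation run after training; these values differ from the epoch-30 row of the training results table below):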
21
+ - Loss: 1.6241
22
+ - F1 Macro: 0.5019
23
+ - F1 Micro: 0.5758
24
+ - F1 Weighted: 0.5679
25
+ - Precision Macro: 0.5021
26
+ - Precision Micro: 0.5758
27
+ - Precision Weighted: 0.5657
28
+ - Recall Macro: 0.5073
29
+ - Recall Micro: 0.5758
30
+ - Recall Weighted: 0.5758
31
+ - Accuracy: 0.5758
32
+
33
+ ## Model description
34
+
35
+ More information needed
36
+
37
+ ## Intended uses & limitations
38
+
39
+ More information needed
40
+
41
+ ## Training and evaluation data
42
+
43
+ More information needed
44
+
45
+ ## Training procedure
46
+
47
+ ### Training hyperparameters
48
+
49
+ The following hyperparameters were used during training:
50
+ - learning_rate: 0.0001
51
+ - train_batch_size: 32
52
+ - eval_batch_size: 32
53
+ - seed: 42
54
+ - optimizer: OptimizerNames.ADAMW_BNB with betas=(0.9,0.999), epsilon=1e-08, and no additional optimizer arguments
55
+ - lr_scheduler_type: linear
56
+ - lr_scheduler_warmup_ratio: 0.1
57
+ - num_epochs: 30
58
+
59
+ ### Training results
60
+
61
+ | Training Loss | Epoch | Step | Validation Loss | F1 Macro | F1 Micro | F1 Weighted | Precision Macro | Precision Micro | Precision Weighted | Recall Macro | Recall Micro | Recall Weighted | Accuracy |
62
+ |:-------------:|:-----:|:----:|:---------------:|:--------:|:--------:|:-----------:|:---------------:|:---------------:|:------------------:|:------------:|:------------:|:---------------:|:--------:|
63
+ | 1.9373 | 1.0 | 15 | 1.8818 | 0.0464 | 0.1894 | 0.0615 | 0.0277 | 0.1894 | 0.0367 | 0.1429 | 0.1894 | 0.1894 | 0.1894 |
64
+ | 1.869 | 2.0 | 30 | 1.8642 | 0.1100 | 0.2652 | 0.1418 | 0.075 | 0.2652 | 0.0968 | 0.2063 | 0.2652 | 0.2652 | 0.2652 |
65
+ | 1.9218 | 3.0 | 45 | 1.8754 | 0.1163 | 0.2576 | 0.1460 | 0.1316 | 0.2576 | 0.1566 | 0.1905 | 0.2576 | 0.2576 | 0.2576 |
66
+ | 1.6733 | 4.0 | 60 | 1.6881 | 0.2445 | 0.3864 | 0.3053 | 0.2427 | 0.3864 | 0.2917 | 0.2992 | 0.3864 | 0.3864 | 0.3864 |
67
+ | 1.54 | 5.0 | 75 | 1.5528 | 0.3252 | 0.4242 | 0.3856 | 0.3429 | 0.4242 | 0.4101 | 0.3570 | 0.4242 | 0.4242 | 0.4242 |
68
+ | 1.4418 | 6.0 | 90 | 1.5737 | 0.2858 | 0.3864 | 0.3213 | 0.2846 | 0.3864 | 0.3243 | 0.3398 | 0.3864 | 0.3864 | 0.3864 |
69
+ | 0.8592 | 7.0 | 105 | 1.5408 | 0.3444 | 0.4394 | 0.3965 | 0.3208 | 0.4394 | 0.3674 | 0.3791 | 0.4394 | 0.4394 | 0.4394 |
70
+ | 1.1427 | 8.0 | 120 | 1.2804 | 0.4638 | 0.5606 | 0.5317 | 0.4698 | 0.5606 | 0.5280 | 0.4831 | 0.5606 | 0.5606 | 0.5606 |
71
+ | 0.7849 | 9.0 | 135 | 1.2880 | 0.4649 | 0.5530 | 0.5291 | 0.4804 | 0.5530 | 0.5401 | 0.4823 | 0.5530 | 0.5530 | 0.5530 |
72
+ | 0.6846 | 10.0 | 150 | 1.3130 | 0.4298 | 0.5152 | 0.4811 | 0.4404 | 0.5152 | 0.5005 | 0.4671 | 0.5152 | 0.5152 | 0.5152 |
73
+ | 0.4006 | 11.0 | 165 | 1.2958 | 0.4931 | 0.5833 | 0.5598 | 0.4983 | 0.5833 | 0.5756 | 0.5229 | 0.5833 | 0.5833 | 0.5833 |
74
+ | 0.4329 | 12.0 | 180 | 1.2990 | 0.5062 | 0.5530 | 0.5562 | 0.5315 | 0.5530 | 0.5874 | 0.5133 | 0.5530 | 0.5530 | 0.5530 |
75
+ | 0.482 | 13.0 | 195 | 1.3831 | 0.4842 | 0.5152 | 0.5233 | 0.5517 | 0.5152 | 0.5803 | 0.4839 | 0.5152 | 0.5152 | 0.5152 |
76
+ | 0.6409 | 14.0 | 210 | 1.4066 | 0.5081 | 0.5985 | 0.5765 | 0.5194 | 0.5985 | 0.5820 | 0.5232 | 0.5985 | 0.5985 | 0.5985 |
77
+ | 0.3206 | 15.0 | 225 | 1.3690 | 0.5155 | 0.5606 | 0.5520 | 0.6158 | 0.5606 | 0.5890 | 0.5170 | 0.5606 | 0.5606 | 0.5606 |
78
+ | 0.1773 | 16.0 | 240 | 1.2568 | 0.5920 | 0.6515 | 0.6408 | 0.6894 | 0.6515 | 0.6623 | 0.5843 | 0.6515 | 0.6515 | 0.6515 |
79
+ | 0.3259 | 17.0 | 255 | 1.3406 | 0.5467 | 0.6061 | 0.5961 | 0.5615 | 0.6061 | 0.6033 | 0.5467 | 0.6061 | 0.6061 | 0.6061 |
80
+ | 0.1123 | 18.0 | 270 | 1.3767 | 0.5868 | 0.6364 | 0.6306 | 0.6258 | 0.6364 | 0.6413 | 0.5785 | 0.6364 | 0.6364 | 0.6364 |
81
+ | 0.1129 | 19.0 | 285 | 1.4680 | 0.5879 | 0.6439 | 0.6306 | 0.6809 | 0.6439 | 0.6933 | 0.5806 | 0.6439 | 0.6439 | 0.6439 |
82
+ | 0.0651 | 20.0 | 300 | 1.4981 | 0.6655 | 0.6894 | 0.6876 | 0.7115 | 0.6894 | 0.7224 | 0.6511 | 0.6894 | 0.6894 | 0.6894 |
83
+ | 0.0685 | 21.0 | 315 | 1.4621 | 0.6091 | 0.6515 | 0.6494 | 0.6303 | 0.6515 | 0.6641 | 0.6040 | 0.6515 | 0.6515 | 0.6515 |
84
+ | 0.1469 | 22.0 | 330 | 1.5347 | 0.5330 | 0.6212 | 0.6040 | 0.5477 | 0.6212 | 0.6149 | 0.5440 | 0.6212 | 0.6212 | 0.6212 |
85
+ | 0.0289 | 23.0 | 345 | 1.5417 | 0.5466 | 0.6288 | 0.6180 | 0.5409 | 0.6288 | 0.6108 | 0.5549 | 0.6288 | 0.6288 | 0.6288 |
86
+ | 0.01 | 24.0 | 360 | 1.5670 | 0.5475 | 0.6364 | 0.6187 | 0.5435 | 0.6364 | 0.6104 | 0.5594 | 0.6364 | 0.6364 | 0.6364 |
87
+ | 0.035 | 25.0 | 375 | 1.6037 | 0.5529 | 0.6364 | 0.6209 | 0.5470 | 0.6364 | 0.6156 | 0.5679 | 0.6364 | 0.6364 | 0.6364 |
88
+ | 0.0109 | 26.0 | 390 | 1.6752 | 0.5897 | 0.6212 | 0.6203 | 0.6145 | 0.6212 | 0.6527 | 0.6000 | 0.6212 | 0.6212 | 0.6212 |
89
+ | 0.038 | 27.0 | 405 | 1.6724 | 0.5344 | 0.6136 | 0.6008 | 0.5332 | 0.6136 | 0.6005 | 0.5468 | 0.6136 | 0.6136 | 0.6136 |
90
+ | 0.0116 | 28.0 | 420 | 1.6252 | 0.5384 | 0.6212 | 0.6090 | 0.5337 | 0.6212 | 0.6033 | 0.5491 | 0.6212 | 0.6212 | 0.6212 |
91
+ | 0.006 | 29.0 | 435 | 1.5980 | 0.5572 | 0.6364 | 0.6294 | 0.5529 | 0.6364 | 0.6246 | 0.5634 | 0.6364 | 0.6364 | 0.6364 |
92
+ | 0.0046 | 30.0 | 450 | 1.5939 | 0.5605 | 0.6439 | 0.6342 | 0.5546 | 0.6439 | 0.6269 | 0.5687 | 0.6439 | 0.6439 | 0.6439 |
93
+
94
+
95
+ ### Framework versions
96
+
97
+ - Transformers 4.48.2
98
+ - Pytorch 2.6.0+cu124
99
+ - Datasets 3.2.0
100
+ - Tokenizers 0.21.0
all_results.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 30.0,
3
+ "eval_accuracy": 0.5757575757575758,
4
+ "eval_f1_macro": 0.5018613142408387,
5
+ "eval_f1_micro": 0.5757575757575758,
6
+ "eval_f1_weighted": 0.5678870521449232,
7
+ "eval_loss": 1.6241134405136108,
8
+ "eval_precision_macro": 0.5021312021312021,
9
+ "eval_precision_micro": 0.5757575757575758,
10
+ "eval_precision_weighted": 0.5657130748039839,
11
+ "eval_recall_macro": 0.507312925170068,
12
+ "eval_recall_micro": 0.5757575757575758,
13
+ "eval_recall_weighted": 0.5757575757575758,
14
+ "eval_runtime": 1.861,
15
+ "eval_samples_per_second": 35.465,
16
+ "eval_steps_per_second": 1.612,
17
+ "total_flos": 1.0740871074163507e+18,
18
+ "train_loss": 0.6009381743893027,
19
+ "train_runtime": 318.0836,
20
+ "train_samples_per_second": 43.573,
21
+ "train_steps_per_second": 1.415
22
+ }
config.json ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "google/vit-base-patch16-224",
3
+ "architectures": [
4
+ "ViTForImageClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.0,
7
+ "encoder_stride": 16,
8
+ "hidden_act": "gelu",
9
+ "hidden_dropout_prob": 0.0,
10
+ "hidden_size": 768,
11
+ "id2label": {
12
+ "0": "-",
13
+ "1": "0",
14
+ "2": "1",
15
+ "3": "2",
16
+ "4": "3",
17
+ "5": "4",
18
+ "6": "5"
19
+ },
20
+ "image_size": 224,
21
+ "initializer_range": 0.02,
22
+ "intermediate_size": 3072,
23
+ "label2id": {
24
+ "-": "0",
25
+ "0": "1",
26
+ "1": "2",
27
+ "2": "3",
28
+ "3": "4",
29
+ "4": "5",
30
+ "5": "6"
31
+ },
32
+ "layer_norm_eps": 1e-12,
33
+ "model_type": "vit",
34
+ "num_attention_heads": 12,
35
+ "num_channels": 3,
36
+ "num_hidden_layers": 12,
37
+ "patch_size": 16,
38
+ "problem_type": "single_label_classification",
39
+ "qkv_bias": true,
40
+ "torch_dtype": "float32",
41
+ "transformers_version": "4.48.2"
42
+ }
eval_results.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 30.0,
3
+ "eval_accuracy": 0.5757575757575758,
4
+ "eval_f1_macro": 0.5018613142408387,
5
+ "eval_f1_micro": 0.5757575757575758,
6
+ "eval_f1_weighted": 0.5678870521449232,
7
+ "eval_loss": 1.6241134405136108,
8
+ "eval_precision_macro": 0.5021312021312021,
9
+ "eval_precision_micro": 0.5757575757575758,
10
+ "eval_precision_weighted": 0.5657130748039839,
11
+ "eval_recall_macro": 0.507312925170068,
12
+ "eval_recall_micro": 0.5757575757575758,
13
+ "eval_recall_weighted": 0.5757575757575758,
14
+ "eval_runtime": 1.861,
15
+ "eval_samples_per_second": 35.465,
16
+ "eval_steps_per_second": 1.612
17
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:65861e669959f8c3f7f1bdbce403590929e54edd1ddeaa1660e460a3fca9e422
3
+ size 343239356
preprocessor_config.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_convert_rgb": null,
3
+ "do_normalize": true,
4
+ "do_rescale": true,
5
+ "do_resize": true,
6
+ "image_mean": [
7
+ 0.5,
8
+ 0.5,
9
+ 0.5
10
+ ],
11
+ "image_processor_type": "ViTImageProcessorFast",
12
+ "image_std": [
13
+ 0.5,
14
+ 0.5,
15
+ 0.5
16
+ ],
17
+ "resample": 2,
18
+ "rescale_factor": 0.00392156862745098,
19
+ "size": {
20
+ "height": 224,
21
+ "width": 224
22
+ }
23
+ }
runs/Feb03_18-39-25_modal/events.out.tfevents.1738607969.modal.2.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1cd989b88e11140fc6612945f3f7d2d919ead3e8579dedebdc02d74bd664ecf7
3
+ size 77614
runs/Feb03_18-39-25_modal/events.out.tfevents.1738607969.modal.2.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:60506e5bdcf9035c18a7a76f00ef4c884b93b73d0cd6160a4e0846b0bd8b84cd
3
+ size 77614
runs/Feb03_18-39-25_modal/events.out.tfevents.1738608289.modal.2.2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d8a81371ec31b0a5fd49b41237e342cd559386d392f1f383ae7e97a52a39b236
3
+ size 921
runs/Feb03_18-39-25_modal/events.out.tfevents.1738608289.modal.2.3 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8b59892af0c2cb0cc3ad9df5062ebc403f3a72c05b2609845d133345e4574707
3
+ size 921
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 30.0,
3
+ "total_flos": 1.0740871074163507e+18,
4
+ "train_loss": 0.6009381743893027,
5
+ "train_runtime": 318.0836,
6
+ "train_samples_per_second": 43.573,
7
+ "train_steps_per_second": 1.415
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,2157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 1.2568100690841675,
3
+ "best_model_checkpoint": "square_run_32_batch/checkpoint-240",
4
+ "epoch": 30.0,
5
+ "eval_steps": 500,
6
+ "global_step": 450,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.13333333333333333,
13
+ "grad_norm": 6.848705291748047,
14
+ "learning_rate": 4.444444444444445e-06,
15
+ "loss": 2.0193,
16
+ "step": 2
17
+ },
18
+ {
19
+ "epoch": 0.26666666666666666,
20
+ "grad_norm": 5.4228925704956055,
21
+ "learning_rate": 8.88888888888889e-06,
22
+ "loss": 2.1013,
23
+ "step": 4
24
+ },
25
+ {
26
+ "epoch": 0.4,
27
+ "grad_norm": 4.7422590255737305,
28
+ "learning_rate": 1.3333333333333333e-05,
29
+ "loss": 1.9569,
30
+ "step": 6
31
+ },
32
+ {
33
+ "epoch": 0.5333333333333333,
34
+ "grad_norm": 5.0756025314331055,
35
+ "learning_rate": 1.777777777777778e-05,
36
+ "loss": 1.9677,
37
+ "step": 8
38
+ },
39
+ {
40
+ "epoch": 0.6666666666666666,
41
+ "grad_norm": 7.1845269203186035,
42
+ "learning_rate": 2.2222222222222223e-05,
43
+ "loss": 1.884,
44
+ "step": 10
45
+ },
46
+ {
47
+ "epoch": 0.8,
48
+ "grad_norm": 4.051375865936279,
49
+ "learning_rate": 2.6666666666666667e-05,
50
+ "loss": 1.9017,
51
+ "step": 12
52
+ },
53
+ {
54
+ "epoch": 0.9333333333333333,
55
+ "grad_norm": 5.293440341949463,
56
+ "learning_rate": 3.111111111111111e-05,
57
+ "loss": 1.9373,
58
+ "step": 14
59
+ },
60
+ {
61
+ "epoch": 1.0,
62
+ "eval_accuracy": 0.1893939393939394,
63
+ "eval_f1_macro": 0.04638218923933209,
64
+ "eval_f1_micro": 0.1893939393939394,
65
+ "eval_f1_weighted": 0.06149153876426603,
66
+ "eval_loss": 1.8818118572235107,
67
+ "eval_precision_macro": 0.027685492801771874,
68
+ "eval_precision_micro": 0.1893939393939394,
69
+ "eval_precision_weighted": 0.03670425182053089,
70
+ "eval_recall_macro": 0.14285714285714285,
71
+ "eval_recall_micro": 0.1893939393939394,
72
+ "eval_recall_weighted": 0.1893939393939394,
73
+ "eval_runtime": 2.2684,
74
+ "eval_samples_per_second": 58.192,
75
+ "eval_steps_per_second": 2.204,
76
+ "step": 15
77
+ },
78
+ {
79
+ "epoch": 1.0666666666666667,
80
+ "grad_norm": 3.4230170249938965,
81
+ "learning_rate": 3.555555555555556e-05,
82
+ "loss": 1.9179,
83
+ "step": 16
84
+ },
85
+ {
86
+ "epoch": 1.2,
87
+ "grad_norm": 3.4030396938323975,
88
+ "learning_rate": 4e-05,
89
+ "loss": 1.8139,
90
+ "step": 18
91
+ },
92
+ {
93
+ "epoch": 1.3333333333333333,
94
+ "grad_norm": 4.195278167724609,
95
+ "learning_rate": 4.4444444444444447e-05,
96
+ "loss": 1.8941,
97
+ "step": 20
98
+ },
99
+ {
100
+ "epoch": 1.4666666666666668,
101
+ "grad_norm": 3.2356927394866943,
102
+ "learning_rate": 4.888888888888889e-05,
103
+ "loss": 1.8915,
104
+ "step": 22
105
+ },
106
+ {
107
+ "epoch": 1.6,
108
+ "grad_norm": 3.322704315185547,
109
+ "learning_rate": 5.333333333333333e-05,
110
+ "loss": 1.917,
111
+ "step": 24
112
+ },
113
+ {
114
+ "epoch": 1.7333333333333334,
115
+ "grad_norm": 3.293910026550293,
116
+ "learning_rate": 5.7777777777777776e-05,
117
+ "loss": 1.8943,
118
+ "step": 26
119
+ },
120
+ {
121
+ "epoch": 1.8666666666666667,
122
+ "grad_norm": 4.803905487060547,
123
+ "learning_rate": 6.222222222222222e-05,
124
+ "loss": 1.8841,
125
+ "step": 28
126
+ },
127
+ {
128
+ "epoch": 2.0,
129
+ "grad_norm": 4.006722927093506,
130
+ "learning_rate": 6.666666666666667e-05,
131
+ "loss": 1.869,
132
+ "step": 30
133
+ },
134
+ {
135
+ "epoch": 2.0,
136
+ "eval_accuracy": 0.26515151515151514,
137
+ "eval_f1_macro": 0.10998877665544333,
138
+ "eval_f1_micro": 0.26515151515151514,
139
+ "eval_f1_weighted": 0.14177124783185388,
140
+ "eval_loss": 1.864223599433899,
141
+ "eval_precision_macro": 0.075,
142
+ "eval_precision_micro": 0.26515151515151514,
143
+ "eval_precision_weighted": 0.09678030303030305,
144
+ "eval_recall_macro": 0.20634920634920634,
145
+ "eval_recall_micro": 0.26515151515151514,
146
+ "eval_recall_weighted": 0.26515151515151514,
147
+ "eval_runtime": 1.8754,
148
+ "eval_samples_per_second": 70.384,
149
+ "eval_steps_per_second": 2.666,
150
+ "step": 30
151
+ },
152
+ {
153
+ "epoch": 2.1333333333333333,
154
+ "grad_norm": 2.15875506401062,
155
+ "learning_rate": 7.111111111111112e-05,
156
+ "loss": 1.8686,
157
+ "step": 32
158
+ },
159
+ {
160
+ "epoch": 2.2666666666666666,
161
+ "grad_norm": 2.8864150047302246,
162
+ "learning_rate": 7.555555555555556e-05,
163
+ "loss": 1.8652,
164
+ "step": 34
165
+ },
166
+ {
167
+ "epoch": 2.4,
168
+ "grad_norm": 3.819974899291992,
169
+ "learning_rate": 8e-05,
170
+ "loss": 1.818,
171
+ "step": 36
172
+ },
173
+ {
174
+ "epoch": 2.533333333333333,
175
+ "grad_norm": 7.35491418838501,
176
+ "learning_rate": 8.444444444444444e-05,
177
+ "loss": 1.9347,
178
+ "step": 38
179
+ },
180
+ {
181
+ "epoch": 2.6666666666666665,
182
+ "grad_norm": 5.84605598449707,
183
+ "learning_rate": 8.888888888888889e-05,
184
+ "loss": 1.8508,
185
+ "step": 40
186
+ },
187
+ {
188
+ "epoch": 2.8,
189
+ "grad_norm": 2.4050137996673584,
190
+ "learning_rate": 9.333333333333334e-05,
191
+ "loss": 1.8884,
192
+ "step": 42
193
+ },
194
+ {
195
+ "epoch": 2.9333333333333336,
196
+ "grad_norm": 5.182938575744629,
197
+ "learning_rate": 9.777777777777778e-05,
198
+ "loss": 1.9218,
199
+ "step": 44
200
+ },
201
+ {
202
+ "epoch": 3.0,
203
+ "eval_accuracy": 0.25757575757575757,
204
+ "eval_f1_macro": 0.11628985865833667,
205
+ "eval_f1_micro": 0.25757575757575757,
206
+ "eval_f1_weighted": 0.1459780747505363,
207
+ "eval_loss": 1.8754385709762573,
208
+ "eval_precision_macro": 0.131615925058548,
209
+ "eval_precision_micro": 0.25757575757575757,
210
+ "eval_precision_weighted": 0.15663189269746647,
211
+ "eval_recall_macro": 0.19047619047619047,
212
+ "eval_recall_micro": 0.25757575757575757,
213
+ "eval_recall_weighted": 0.25757575757575757,
214
+ "eval_runtime": 1.9051,
215
+ "eval_samples_per_second": 69.286,
216
+ "eval_steps_per_second": 2.624,
217
+ "step": 45
218
+ },
219
+ {
220
+ "epoch": 3.066666666666667,
221
+ "grad_norm": 5.7963128089904785,
222
+ "learning_rate": 9.97530864197531e-05,
223
+ "loss": 1.9754,
224
+ "step": 46
225
+ },
226
+ {
227
+ "epoch": 3.2,
228
+ "grad_norm": 5.1605048179626465,
229
+ "learning_rate": 9.925925925925926e-05,
230
+ "loss": 1.8577,
231
+ "step": 48
232
+ },
233
+ {
234
+ "epoch": 3.3333333333333335,
235
+ "grad_norm": 3.756355047225952,
236
+ "learning_rate": 9.876543209876543e-05,
237
+ "loss": 1.8707,
238
+ "step": 50
239
+ },
240
+ {
241
+ "epoch": 3.466666666666667,
242
+ "grad_norm": 2.5353710651397705,
243
+ "learning_rate": 9.827160493827162e-05,
244
+ "loss": 1.7918,
245
+ "step": 52
246
+ },
247
+ {
248
+ "epoch": 3.6,
249
+ "grad_norm": 4.181753635406494,
250
+ "learning_rate": 9.777777777777778e-05,
251
+ "loss": 1.8251,
252
+ "step": 54
253
+ },
254
+ {
255
+ "epoch": 3.7333333333333334,
256
+ "grad_norm": 2.4634644985198975,
257
+ "learning_rate": 9.728395061728396e-05,
258
+ "loss": 1.7713,
259
+ "step": 56
260
+ },
261
+ {
262
+ "epoch": 3.8666666666666667,
263
+ "grad_norm": 8.700553894042969,
264
+ "learning_rate": 9.679012345679013e-05,
265
+ "loss": 1.8962,
266
+ "step": 58
267
+ },
268
+ {
269
+ "epoch": 4.0,
270
+ "grad_norm": 5.921916484832764,
271
+ "learning_rate": 9.62962962962963e-05,
272
+ "loss": 1.6733,
273
+ "step": 60
274
+ },
275
+ {
276
+ "epoch": 4.0,
277
+ "eval_accuracy": 0.38636363636363635,
278
+ "eval_f1_macro": 0.2445293836598184,
279
+ "eval_f1_micro": 0.38636363636363635,
280
+ "eval_f1_weighted": 0.3052538765582244,
281
+ "eval_loss": 1.6881486177444458,
282
+ "eval_precision_macro": 0.24274221103966703,
283
+ "eval_precision_micro": 0.38636363636363635,
284
+ "eval_precision_weighted": 0.2917426054412356,
285
+ "eval_recall_macro": 0.2992441421012849,
286
+ "eval_recall_micro": 0.38636363636363635,
287
+ "eval_recall_weighted": 0.38636363636363635,
288
+ "eval_runtime": 1.8719,
289
+ "eval_samples_per_second": 70.516,
290
+ "eval_steps_per_second": 2.671,
291
+ "step": 60
292
+ },
293
+ {
294
+ "epoch": 4.133333333333334,
295
+ "grad_norm": 3.3924758434295654,
296
+ "learning_rate": 9.580246913580247e-05,
297
+ "loss": 1.5941,
298
+ "step": 62
299
+ },
300
+ {
301
+ "epoch": 4.266666666666667,
302
+ "grad_norm": 6.785348415374756,
303
+ "learning_rate": 9.530864197530865e-05,
304
+ "loss": 1.582,
305
+ "step": 64
306
+ },
307
+ {
308
+ "epoch": 4.4,
309
+ "grad_norm": 4.813143730163574,
310
+ "learning_rate": 9.481481481481483e-05,
311
+ "loss": 1.649,
312
+ "step": 66
313
+ },
314
+ {
315
+ "epoch": 4.533333333333333,
316
+ "grad_norm": 5.351255893707275,
317
+ "learning_rate": 9.432098765432099e-05,
318
+ "loss": 1.6271,
319
+ "step": 68
320
+ },
321
+ {
322
+ "epoch": 4.666666666666667,
323
+ "grad_norm": 11.194862365722656,
324
+ "learning_rate": 9.382716049382717e-05,
325
+ "loss": 1.7395,
326
+ "step": 70
327
+ },
328
+ {
329
+ "epoch": 4.8,
330
+ "grad_norm": 6.263866424560547,
331
+ "learning_rate": 9.333333333333334e-05,
332
+ "loss": 1.4422,
333
+ "step": 72
334
+ },
335
+ {
336
+ "epoch": 4.933333333333334,
337
+ "grad_norm": 8.602386474609375,
338
+ "learning_rate": 9.28395061728395e-05,
339
+ "loss": 1.54,
340
+ "step": 74
341
+ },
342
+ {
343
+ "epoch": 5.0,
344
+ "eval_accuracy": 0.42424242424242425,
345
+ "eval_f1_macro": 0.32515713851372885,
346
+ "eval_f1_micro": 0.42424242424242425,
347
+ "eval_f1_weighted": 0.38558697740383735,
348
+ "eval_loss": 1.5528110265731812,
349
+ "eval_precision_macro": 0.34291374508765815,
350
+ "eval_precision_micro": 0.42424242424242425,
351
+ "eval_precision_weighted": 0.4100833883442579,
352
+ "eval_recall_macro": 0.35698412698412696,
353
+ "eval_recall_micro": 0.42424242424242425,
354
+ "eval_recall_weighted": 0.42424242424242425,
355
+ "eval_runtime": 1.9443,
356
+ "eval_samples_per_second": 67.891,
357
+ "eval_steps_per_second": 2.572,
358
+ "step": 75
359
+ },
360
+ {
361
+ "epoch": 5.066666666666666,
362
+ "grad_norm": 7.738183498382568,
363
+ "learning_rate": 9.234567901234568e-05,
364
+ "loss": 1.6152,
365
+ "step": 76
366
+ },
367
+ {
368
+ "epoch": 5.2,
369
+ "grad_norm": 7.564102649688721,
370
+ "learning_rate": 9.185185185185186e-05,
371
+ "loss": 1.4993,
372
+ "step": 78
373
+ },
374
+ {
375
+ "epoch": 5.333333333333333,
376
+ "grad_norm": 8.335043907165527,
377
+ "learning_rate": 9.135802469135802e-05,
378
+ "loss": 1.494,
379
+ "step": 80
380
+ },
381
+ {
382
+ "epoch": 5.466666666666667,
383
+ "grad_norm": 6.382967948913574,
384
+ "learning_rate": 9.08641975308642e-05,
385
+ "loss": 1.4944,
386
+ "step": 82
387
+ },
388
+ {
389
+ "epoch": 5.6,
390
+ "grad_norm": 7.259094715118408,
391
+ "learning_rate": 9.037037037037038e-05,
392
+ "loss": 1.3191,
393
+ "step": 84
394
+ },
395
+ {
396
+ "epoch": 5.733333333333333,
397
+ "grad_norm": 4.972009658813477,
398
+ "learning_rate": 8.987654320987655e-05,
399
+ "loss": 1.3894,
400
+ "step": 86
401
+ },
402
+ {
403
+ "epoch": 5.866666666666667,
404
+ "grad_norm": 9.250694274902344,
405
+ "learning_rate": 8.938271604938272e-05,
406
+ "loss": 1.3979,
407
+ "step": 88
408
+ },
409
+ {
410
+ "epoch": 6.0,
411
+ "grad_norm": 7.207069396972656,
412
+ "learning_rate": 8.888888888888889e-05,
413
+ "loss": 1.4418,
414
+ "step": 90
415
+ },
416
+ {
417
+ "epoch": 6.0,
418
+ "eval_accuracy": 0.38636363636363635,
419
+ "eval_f1_macro": 0.285838283865586,
420
+ "eval_f1_micro": 0.38636363636363635,
421
+ "eval_f1_weighted": 0.3212562379097532,
422
+ "eval_loss": 1.5736558437347412,
423
+ "eval_precision_macro": 0.284608858206039,
424
+ "eval_precision_micro": 0.38636363636363635,
425
+ "eval_precision_weighted": 0.3242631096693543,
426
+ "eval_recall_macro": 0.3398034769463341,
427
+ "eval_recall_micro": 0.38636363636363635,
428
+ "eval_recall_weighted": 0.38636363636363635,
429
+ "eval_runtime": 1.8593,
430
+ "eval_samples_per_second": 70.995,
431
+ "eval_steps_per_second": 2.689,
432
+ "step": 90
433
+ },
434
+ {
435
+ "epoch": 6.133333333333334,
436
+ "grad_norm": 4.753687381744385,
437
+ "learning_rate": 8.839506172839507e-05,
438
+ "loss": 1.3218,
439
+ "step": 92
440
+ },
441
+ {
442
+ "epoch": 6.266666666666667,
443
+ "grad_norm": 5.942229747772217,
444
+ "learning_rate": 8.790123456790123e-05,
445
+ "loss": 1.3995,
446
+ "step": 94
447
+ },
448
+ {
449
+ "epoch": 6.4,
450
+ "grad_norm": 4.026015281677246,
451
+ "learning_rate": 8.740740740740741e-05,
452
+ "loss": 1.3155,
453
+ "step": 96
454
+ },
455
+ {
456
+ "epoch": 6.533333333333333,
457
+ "grad_norm": 4.893887042999268,
458
+ "learning_rate": 8.691358024691359e-05,
459
+ "loss": 1.2009,
460
+ "step": 98
461
+ },
462
+ {
463
+ "epoch": 6.666666666666667,
464
+ "grad_norm": 3.904926061630249,
465
+ "learning_rate": 8.641975308641975e-05,
466
+ "loss": 0.98,
467
+ "step": 100
468
+ },
469
+ {
470
+ "epoch": 6.8,
471
+ "grad_norm": 4.266864776611328,
472
+ "learning_rate": 8.592592592592593e-05,
473
+ "loss": 1.0803,
474
+ "step": 102
475
+ },
476
+ {
477
+ "epoch": 6.933333333333334,
478
+ "grad_norm": 5.24403190612793,
479
+ "learning_rate": 8.54320987654321e-05,
480
+ "loss": 0.8592,
481
+ "step": 104
482
+ },
483
+ {
484
+ "epoch": 7.0,
485
+ "eval_accuracy": 0.4393939393939394,
486
+ "eval_f1_macro": 0.3443599467808913,
487
+ "eval_f1_micro": 0.4393939393939394,
488
+ "eval_f1_weighted": 0.39645908811500513,
489
+ "eval_loss": 1.5408130884170532,
490
+ "eval_precision_macro": 0.32083233878346656,
491
+ "eval_precision_micro": 0.4393939393939394,
492
+ "eval_precision_weighted": 0.36735850041771095,
493
+ "eval_recall_macro": 0.37913832199546477,
494
+ "eval_recall_micro": 0.4393939393939394,
495
+ "eval_recall_weighted": 0.4393939393939394,
496
+ "eval_runtime": 1.9485,
497
+ "eval_samples_per_second": 67.743,
498
+ "eval_steps_per_second": 2.566,
499
+ "step": 105
500
+ },
501
+ {
502
+ "epoch": 7.066666666666666,
503
+ "grad_norm": 5.595825672149658,
504
+ "learning_rate": 8.493827160493828e-05,
505
+ "loss": 1.0203,
506
+ "step": 106
507
+ },
508
+ {
509
+ "epoch": 7.2,
510
+ "grad_norm": 5.34617805480957,
511
+ "learning_rate": 8.444444444444444e-05,
512
+ "loss": 1.0819,
513
+ "step": 108
514
+ },
515
+ {
516
+ "epoch": 7.333333333333333,
517
+ "grad_norm": 6.987905025482178,
518
+ "learning_rate": 8.395061728395062e-05,
519
+ "loss": 1.1165,
520
+ "step": 110
521
+ },
522
+ {
523
+ "epoch": 7.466666666666667,
524
+ "grad_norm": 6.039572715759277,
525
+ "learning_rate": 8.34567901234568e-05,
526
+ "loss": 1.0403,
527
+ "step": 112
528
+ },
529
+ {
530
+ "epoch": 7.6,
531
+ "grad_norm": 6.031858444213867,
532
+ "learning_rate": 8.296296296296296e-05,
533
+ "loss": 0.9709,
534
+ "step": 114
535
+ },
536
+ {
537
+ "epoch": 7.733333333333333,
538
+ "grad_norm": 6.656283855438232,
539
+ "learning_rate": 8.246913580246915e-05,
540
+ "loss": 0.8358,
541
+ "step": 116
542
+ },
543
+ {
544
+ "epoch": 7.866666666666667,
545
+ "grad_norm": 6.286685943603516,
546
+ "learning_rate": 8.197530864197531e-05,
547
+ "loss": 1.146,
548
+ "step": 118
549
+ },
550
+ {
551
+ "epoch": 8.0,
552
+ "grad_norm": 9.892986297607422,
553
+ "learning_rate": 8.148148148148148e-05,
554
+ "loss": 1.1427,
555
+ "step": 120
556
+ },
557
+ {
558
+ "epoch": 8.0,
559
+ "eval_accuracy": 0.5606060606060606,
560
+ "eval_f1_macro": 0.46377203827822905,
561
+ "eval_f1_micro": 0.5606060606060606,
562
+ "eval_f1_weighted": 0.5317054176401353,
563
+ "eval_loss": 1.2803829908370972,
564
+ "eval_precision_macro": 0.469819473380193,
565
+ "eval_precision_micro": 0.5606060606060606,
566
+ "eval_precision_weighted": 0.5280005916463256,
567
+ "eval_recall_macro": 0.4830687830687831,
568
+ "eval_recall_micro": 0.5606060606060606,
569
+ "eval_recall_weighted": 0.5606060606060606,
570
+ "eval_runtime": 1.9474,
571
+ "eval_samples_per_second": 67.784,
572
+ "eval_steps_per_second": 2.568,
573
+ "step": 120
574
+ },
575
+ {
576
+ "epoch": 8.133333333333333,
577
+ "grad_norm": 4.904130458831787,
578
+ "learning_rate": 8.098765432098767e-05,
579
+ "loss": 0.8933,
580
+ "step": 122
581
+ },
582
+ {
583
+ "epoch": 8.266666666666667,
584
+ "grad_norm": 4.419686794281006,
585
+ "learning_rate": 8.049382716049383e-05,
586
+ "loss": 0.9245,
587
+ "step": 124
588
+ },
589
+ {
590
+ "epoch": 8.4,
591
+ "grad_norm": 8.33668041229248,
592
+ "learning_rate": 8e-05,
593
+ "loss": 0.8385,
594
+ "step": 126
595
+ },
596
+ {
597
+ "epoch": 8.533333333333333,
598
+ "grad_norm": 8.35203742980957,
599
+ "learning_rate": 7.950617283950618e-05,
600
+ "loss": 0.9428,
601
+ "step": 128
602
+ },
603
+ {
604
+ "epoch": 8.666666666666666,
605
+ "grad_norm": 5.724539279937744,
606
+ "learning_rate": 7.901234567901235e-05,
607
+ "loss": 0.7591,
608
+ "step": 130
609
+ },
610
+ {
611
+ "epoch": 8.8,
612
+ "grad_norm": 8.662413597106934,
613
+ "learning_rate": 7.851851851851852e-05,
614
+ "loss": 0.995,
615
+ "step": 132
616
+ },
617
+ {
618
+ "epoch": 8.933333333333334,
619
+ "grad_norm": 5.197690010070801,
620
+ "learning_rate": 7.802469135802469e-05,
621
+ "loss": 0.7849,
622
+ "step": 134
623
+ },
624
+ {
625
+ "epoch": 9.0,
626
+ "eval_accuracy": 0.553030303030303,
627
+ "eval_f1_macro": 0.46486536691732006,
628
+ "eval_f1_micro": 0.553030303030303,
629
+ "eval_f1_weighted": 0.529141811901771,
630
+ "eval_loss": 1.2879999876022339,
631
+ "eval_precision_macro": 0.48036078903674717,
632
+ "eval_precision_micro": 0.553030303030303,
633
+ "eval_precision_weighted": 0.540143107077697,
634
+ "eval_recall_macro": 0.4822675736961451,
635
+ "eval_recall_micro": 0.553030303030303,
636
+ "eval_recall_weighted": 0.553030303030303,
637
+ "eval_runtime": 1.9912,
638
+ "eval_samples_per_second": 66.292,
639
+ "eval_steps_per_second": 2.511,
640
+ "step": 135
641
+ },
642
+ {
643
+ "epoch": 9.066666666666666,
644
+ "grad_norm": 7.03670597076416,
645
+ "learning_rate": 7.753086419753088e-05,
646
+ "loss": 0.8049,
647
+ "step": 136
648
+ },
649
+ {
650
+ "epoch": 9.2,
651
+ "grad_norm": 5.591729640960693,
652
+ "learning_rate": 7.703703703703704e-05,
653
+ "loss": 0.9341,
654
+ "step": 138
655
+ },
656
+ {
657
+ "epoch": 9.333333333333334,
658
+ "grad_norm": 6.677962303161621,
659
+ "learning_rate": 7.65432098765432e-05,
660
+ "loss": 0.7679,
661
+ "step": 140
662
+ },
663
+ {
664
+ "epoch": 9.466666666666667,
665
+ "grad_norm": 5.4789934158325195,
666
+ "learning_rate": 7.60493827160494e-05,
667
+ "loss": 0.7773,
668
+ "step": 142
669
+ },
670
+ {
671
+ "epoch": 9.6,
672
+ "grad_norm": 5.957266330718994,
673
+ "learning_rate": 7.555555555555556e-05,
674
+ "loss": 0.638,
675
+ "step": 144
676
+ },
677
+ {
678
+ "epoch": 9.733333333333333,
679
+ "grad_norm": 5.691118240356445,
680
+ "learning_rate": 7.506172839506173e-05,
681
+ "loss": 0.7762,
682
+ "step": 146
683
+ },
684
+ {
685
+ "epoch": 9.866666666666667,
686
+ "grad_norm": 6.8899827003479,
687
+ "learning_rate": 7.456790123456791e-05,
688
+ "loss": 0.9012,
689
+ "step": 148
690
+ },
691
+ {
692
+ "epoch": 10.0,
693
+ "grad_norm": 7.408969402313232,
694
+ "learning_rate": 7.407407407407407e-05,
695
+ "loss": 0.6846,
696
+ "step": 150
697
+ },
698
+ {
699
+ "epoch": 10.0,
700
+ "eval_accuracy": 0.5151515151515151,
701
+ "eval_f1_macro": 0.42983280392444223,
702
+ "eval_f1_micro": 0.5151515151515151,
703
+ "eval_f1_weighted": 0.48105498068393227,
704
+ "eval_loss": 1.3130199909210205,
705
+ "eval_precision_macro": 0.4404005812415951,
706
+ "eval_precision_micro": 0.5151515151515151,
707
+ "eval_precision_weighted": 0.5005354338015628,
708
+ "eval_recall_macro": 0.4670748299319728,
709
+ "eval_recall_micro": 0.5151515151515151,
710
+ "eval_recall_weighted": 0.5151515151515151,
711
+ "eval_runtime": 1.9855,
712
+ "eval_samples_per_second": 66.482,
713
+ "eval_steps_per_second": 2.518,
714
+ "step": 150
715
+ },
716
+ {
717
+ "epoch": 10.133333333333333,
718
+ "grad_norm": 5.070552825927734,
719
+ "learning_rate": 7.358024691358025e-05,
720
+ "loss": 0.6116,
721
+ "step": 152
722
+ },
723
+ {
724
+ "epoch": 10.266666666666667,
725
+ "grad_norm": 4.844223499298096,
726
+ "learning_rate": 7.308641975308643e-05,
727
+ "loss": 0.6517,
728
+ "step": 154
729
+ },
730
+ {
731
+ "epoch": 10.4,
732
+ "grad_norm": 3.965522289276123,
733
+ "learning_rate": 7.25925925925926e-05,
734
+ "loss": 0.5573,
735
+ "step": 156
736
+ },
737
+ {
738
+ "epoch": 10.533333333333333,
739
+ "grad_norm": 7.53262996673584,
740
+ "learning_rate": 7.209876543209877e-05,
741
+ "loss": 0.7258,
742
+ "step": 158
743
+ },
744
+ {
745
+ "epoch": 10.666666666666666,
746
+ "grad_norm": 6.725161552429199,
747
+ "learning_rate": 7.160493827160494e-05,
748
+ "loss": 0.8109,
749
+ "step": 160
750
+ },
751
+ {
752
+ "epoch": 10.8,
753
+ "grad_norm": 8.250865936279297,
754
+ "learning_rate": 7.111111111111112e-05,
755
+ "loss": 0.8596,
756
+ "step": 162
757
+ },
758
+ {
759
+ "epoch": 10.933333333333334,
760
+ "grad_norm": 4.163515567779541,
761
+ "learning_rate": 7.061728395061728e-05,
762
+ "loss": 0.4006,
763
+ "step": 164
764
+ },
765
+ {
766
+ "epoch": 11.0,
767
+ "eval_accuracy": 0.5833333333333334,
768
+ "eval_f1_macro": 0.49308835780529725,
769
+ "eval_f1_micro": 0.5833333333333334,
770
+ "eval_f1_weighted": 0.5597960736751899,
771
+ "eval_loss": 1.295769214630127,
772
+ "eval_precision_macro": 0.498317425896604,
773
+ "eval_precision_micro": 0.5833333333333334,
774
+ "eval_precision_weighted": 0.5756076561299337,
775
+ "eval_recall_macro": 0.5229024943310657,
776
+ "eval_recall_micro": 0.5833333333333334,
777
+ "eval_recall_weighted": 0.5833333333333334,
778
+ "eval_runtime": 1.9133,
779
+ "eval_samples_per_second": 68.991,
780
+ "eval_steps_per_second": 2.613,
781
+ "step": 165
782
+ },
783
+ {
784
+ "epoch": 11.066666666666666,
785
+ "grad_norm": 4.829576015472412,
786
+ "learning_rate": 7.012345679012346e-05,
787
+ "loss": 0.6355,
788
+ "step": 166
789
+ },
790
+ {
791
+ "epoch": 11.2,
792
+ "grad_norm": 5.353898525238037,
793
+ "learning_rate": 6.962962962962964e-05,
794
+ "loss": 0.4955,
795
+ "step": 168
796
+ },
797
+ {
798
+ "epoch": 11.333333333333334,
799
+ "grad_norm": 5.44912052154541,
800
+ "learning_rate": 6.91358024691358e-05,
801
+ "loss": 0.4833,
802
+ "step": 170
803
+ },
804
+ {
805
+ "epoch": 11.466666666666667,
806
+ "grad_norm": 5.900742530822754,
807
+ "learning_rate": 6.864197530864198e-05,
808
+ "loss": 0.5752,
809
+ "step": 172
810
+ },
811
+ {
812
+ "epoch": 11.6,
813
+ "grad_norm": 6.004303455352783,
814
+ "learning_rate": 6.814814814814815e-05,
815
+ "loss": 0.5738,
816
+ "step": 174
817
+ },
818
+ {
819
+ "epoch": 11.733333333333333,
820
+ "grad_norm": 3.937319040298462,
821
+ "learning_rate": 6.765432098765433e-05,
822
+ "loss": 0.4661,
823
+ "step": 176
824
+ },
825
+ {
826
+ "epoch": 11.866666666666667,
827
+ "grad_norm": 4.814683437347412,
828
+ "learning_rate": 6.716049382716049e-05,
829
+ "loss": 0.5694,
830
+ "step": 178
831
+ },
832
+ {
833
+ "epoch": 12.0,
834
+ "grad_norm": 6.7769880294799805,
835
+ "learning_rate": 6.666666666666667e-05,
836
+ "loss": 0.4329,
837
+ "step": 180
838
+ },
839
+ {
840
+ "epoch": 12.0,
841
+ "eval_accuracy": 0.553030303030303,
842
+ "eval_f1_macro": 0.506246746427407,
843
+ "eval_f1_micro": 0.553030303030303,
844
+ "eval_f1_weighted": 0.5561970515744254,
845
+ "eval_loss": 1.299007773399353,
846
+ "eval_precision_macro": 0.5314684490530354,
847
+ "eval_precision_micro": 0.553030303030303,
848
+ "eval_precision_weighted": 0.5874290165244113,
849
+ "eval_recall_macro": 0.5133106575963718,
850
+ "eval_recall_micro": 0.553030303030303,
851
+ "eval_recall_weighted": 0.553030303030303,
852
+ "eval_runtime": 2.0372,
853
+ "eval_samples_per_second": 64.793,
854
+ "eval_steps_per_second": 2.454,
855
+ "step": 180
856
+ },
857
+ {
858
+ "epoch": 12.133333333333333,
859
+ "grad_norm": 5.787886619567871,
860
+ "learning_rate": 6.617283950617285e-05,
861
+ "loss": 0.5719,
862
+ "step": 182
863
+ },
864
+ {
865
+ "epoch": 12.266666666666667,
866
+ "grad_norm": 2.843268632888794,
867
+ "learning_rate": 6.567901234567901e-05,
868
+ "loss": 0.4646,
869
+ "step": 184
870
+ },
871
+ {
872
+ "epoch": 12.4,
873
+ "grad_norm": 4.530274391174316,
874
+ "learning_rate": 6.51851851851852e-05,
875
+ "loss": 0.3544,
876
+ "step": 186
877
+ },
878
+ {
879
+ "epoch": 12.533333333333333,
880
+ "grad_norm": 5.348933696746826,
881
+ "learning_rate": 6.469135802469136e-05,
882
+ "loss": 0.3957,
883
+ "step": 188
884
+ },
885
+ {
886
+ "epoch": 12.666666666666666,
887
+ "grad_norm": 7.746328830718994,
888
+ "learning_rate": 6.419753086419753e-05,
889
+ "loss": 0.4989,
890
+ "step": 190
891
+ },
892
+ {
893
+ "epoch": 12.8,
894
+ "grad_norm": 6.134746074676514,
895
+ "learning_rate": 6.37037037037037e-05,
896
+ "loss": 0.7035,
897
+ "step": 192
898
+ },
899
+ {
900
+ "epoch": 12.933333333333334,
901
+ "grad_norm": 5.567310810089111,
902
+ "learning_rate": 6.320987654320988e-05,
903
+ "loss": 0.482,
904
+ "step": 194
905
+ },
906
+ {
907
+ "epoch": 13.0,
908
+ "eval_accuracy": 0.5151515151515151,
909
+ "eval_f1_macro": 0.4842067834885892,
910
+ "eval_f1_micro": 0.5151515151515151,
911
+ "eval_f1_weighted": 0.5233183119383529,
912
+ "eval_loss": 1.3830989599227905,
913
+ "eval_precision_macro": 0.5517290249433106,
914
+ "eval_precision_micro": 0.5151515151515151,
915
+ "eval_precision_weighted": 0.5803270803270804,
916
+ "eval_recall_macro": 0.48390778533635675,
917
+ "eval_recall_micro": 0.5151515151515151,
918
+ "eval_recall_weighted": 0.5151515151515151,
919
+ "eval_runtime": 2.882,
920
+ "eval_samples_per_second": 45.801,
921
+ "eval_steps_per_second": 1.735,
922
+ "step": 195
923
+ },
924
+ {
925
+ "epoch": 13.066666666666666,
926
+ "grad_norm": 6.7704386711120605,
927
+ "learning_rate": 6.271604938271606e-05,
928
+ "loss": 0.5136,
929
+ "step": 196
930
+ },
931
+ {
932
+ "epoch": 13.2,
933
+ "grad_norm": 5.41668701171875,
934
+ "learning_rate": 6.222222222222222e-05,
935
+ "loss": 0.4843,
936
+ "step": 198
937
+ },
938
+ {
939
+ "epoch": 13.333333333333334,
940
+ "grad_norm": 4.7562150955200195,
941
+ "learning_rate": 6.17283950617284e-05,
942
+ "loss": 0.3338,
943
+ "step": 200
944
+ },
945
+ {
946
+ "epoch": 13.466666666666667,
947
+ "grad_norm": 4.077147960662842,
948
+ "learning_rate": 6.123456790123457e-05,
949
+ "loss": 0.2694,
950
+ "step": 202
951
+ },
952
+ {
953
+ "epoch": 13.6,
954
+ "grad_norm": 4.678223609924316,
955
+ "learning_rate": 6.074074074074074e-05,
956
+ "loss": 0.2965,
957
+ "step": 204
958
+ },
959
+ {
960
+ "epoch": 13.733333333333333,
961
+ "grad_norm": 6.246657371520996,
962
+ "learning_rate": 6.024691358024692e-05,
963
+ "loss": 0.489,
964
+ "step": 206
965
+ },
966
+ {
967
+ "epoch": 13.866666666666667,
968
+ "grad_norm": 4.0403971672058105,
969
+ "learning_rate": 5.975308641975309e-05,
970
+ "loss": 0.3524,
971
+ "step": 208
972
+ },
973
+ {
974
+ "epoch": 14.0,
975
+ "grad_norm": 11.723469734191895,
976
+ "learning_rate": 5.925925925925926e-05,
977
+ "loss": 0.6409,
978
+ "step": 210
979
+ },
980
+ {
981
+ "epoch": 14.0,
982
+ "eval_accuracy": 0.5984848484848485,
983
+ "eval_f1_macro": 0.5080833548412379,
984
+ "eval_f1_micro": 0.5984848484848485,
985
+ "eval_f1_weighted": 0.576454835403795,
986
+ "eval_loss": 1.4066194295883179,
987
+ "eval_precision_macro": 0.5193577256077255,
988
+ "eval_precision_micro": 0.5984848484848485,
989
+ "eval_precision_weighted": 0.5819911307127215,
990
+ "eval_recall_macro": 0.5231594860166289,
991
+ "eval_recall_micro": 0.5984848484848485,
992
+ "eval_recall_weighted": 0.5984848484848485,
993
+ "eval_runtime": 4.8101,
994
+ "eval_samples_per_second": 27.442,
995
+ "eval_steps_per_second": 1.039,
996
+ "step": 210
997
+ },
998
+ {
999
+ "epoch": 14.133333333333333,
1000
+ "grad_norm": 4.278630256652832,
1001
+ "learning_rate": 5.8765432098765437e-05,
1002
+ "loss": 0.1963,
1003
+ "step": 212
1004
+ },
1005
+ {
1006
+ "epoch": 14.266666666666667,
1007
+ "grad_norm": 5.803009510040283,
1008
+ "learning_rate": 5.8271604938271607e-05,
1009
+ "loss": 0.4284,
1010
+ "step": 214
1011
+ },
1012
+ {
1013
+ "epoch": 14.4,
1014
+ "grad_norm": 4.886916160583496,
1015
+ "learning_rate": 5.7777777777777776e-05,
1016
+ "loss": 0.3091,
1017
+ "step": 216
1018
+ },
1019
+ {
1020
+ "epoch": 14.533333333333333,
1021
+ "grad_norm": 6.119672775268555,
1022
+ "learning_rate": 5.728395061728395e-05,
1023
+ "loss": 0.3287,
1024
+ "step": 218
1025
+ },
1026
+ {
1027
+ "epoch": 14.666666666666666,
1028
+ "grad_norm": 7.14682149887085,
1029
+ "learning_rate": 5.679012345679012e-05,
1030
+ "loss": 0.2819,
1031
+ "step": 220
1032
+ },
1033
+ {
1034
+ "epoch": 14.8,
1035
+ "grad_norm": 5.075103282928467,
1036
+ "learning_rate": 5.62962962962963e-05,
1037
+ "loss": 0.2101,
1038
+ "step": 222
1039
+ },
1040
+ {
1041
+ "epoch": 14.933333333333334,
1042
+ "grad_norm": 4.5539045333862305,
1043
+ "learning_rate": 5.580246913580247e-05,
1044
+ "loss": 0.3206,
1045
+ "step": 224
1046
+ },
1047
+ {
1048
+ "epoch": 15.0,
1049
+ "eval_accuracy": 0.5606060606060606,
1050
+ "eval_f1_macro": 0.5154896879386676,
1051
+ "eval_f1_micro": 0.5606060606060606,
1052
+ "eval_f1_weighted": 0.5520090359376074,
1053
+ "eval_loss": 1.3689966201782227,
1054
+ "eval_precision_macro": 0.6158199643493761,
1055
+ "eval_precision_micro": 0.5606060606060606,
1056
+ "eval_precision_weighted": 0.5889932074758278,
1057
+ "eval_recall_macro": 0.5170219198790628,
1058
+ "eval_recall_micro": 0.5606060606060606,
1059
+ "eval_recall_weighted": 0.5606060606060606,
1060
+ "eval_runtime": 2.0949,
1061
+ "eval_samples_per_second": 63.009,
1062
+ "eval_steps_per_second": 2.387,
1063
+ "step": 225
1064
+ },
1065
+ {
1066
+ "epoch": 15.066666666666666,
1067
+ "grad_norm": 4.093947887420654,
1068
+ "learning_rate": 5.530864197530864e-05,
1069
+ "loss": 0.3352,
1070
+ "step": 226
1071
+ },
1072
+ {
1073
+ "epoch": 15.2,
1074
+ "grad_norm": 5.242745876312256,
1075
+ "learning_rate": 5.4814814814814817e-05,
1076
+ "loss": 0.2066,
1077
+ "step": 228
1078
+ },
1079
+ {
1080
+ "epoch": 15.333333333333334,
1081
+ "grad_norm": 5.613947868347168,
1082
+ "learning_rate": 5.4320987654320986e-05,
1083
+ "loss": 0.3504,
1084
+ "step": 230
1085
+ },
1086
+ {
1087
+ "epoch": 15.466666666666667,
1088
+ "grad_norm": 3.4319839477539062,
1089
+ "learning_rate": 5.382716049382717e-05,
1090
+ "loss": 0.2294,
1091
+ "step": 232
1092
+ },
1093
+ {
1094
+ "epoch": 15.6,
1095
+ "grad_norm": 6.01231575012207,
1096
+ "learning_rate": 5.333333333333333e-05,
1097
+ "loss": 0.2498,
1098
+ "step": 234
1099
+ },
1100
+ {
1101
+ "epoch": 15.733333333333333,
1102
+ "grad_norm": 3.9071357250213623,
1103
+ "learning_rate": 5.28395061728395e-05,
1104
+ "loss": 0.2092,
1105
+ "step": 236
1106
+ },
1107
+ {
1108
+ "epoch": 15.866666666666667,
1109
+ "grad_norm": 5.718769550323486,
1110
+ "learning_rate": 5.234567901234568e-05,
1111
+ "loss": 0.2223,
1112
+ "step": 238
1113
+ },
1114
+ {
1115
+ "epoch": 16.0,
1116
+ "grad_norm": 4.071746349334717,
1117
+ "learning_rate": 5.185185185185185e-05,
1118
+ "loss": 0.1773,
1119
+ "step": 240
1120
+ },
1121
+ {
1122
+ "epoch": 16.0,
1123
+ "eval_accuracy": 0.6515151515151515,
1124
+ "eval_f1_macro": 0.592019301793738,
1125
+ "eval_f1_micro": 0.6515151515151515,
1126
+ "eval_f1_weighted": 0.6407837434153223,
1127
+ "eval_loss": 1.2568100690841675,
1128
+ "eval_precision_macro": 0.6893528941196284,
1129
+ "eval_precision_micro": 0.6515151515151515,
1130
+ "eval_precision_weighted": 0.6623135907365115,
1131
+ "eval_recall_macro": 0.5842857142857143,
1132
+ "eval_recall_micro": 0.6515151515151515,
1133
+ "eval_recall_weighted": 0.6515151515151515,
1134
+ "eval_runtime": 1.9927,
1135
+ "eval_samples_per_second": 66.24,
1136
+ "eval_steps_per_second": 2.509,
1137
+ "step": 240
1138
+ },
1139
+ {
1140
+ "epoch": 16.133333333333333,
1141
+ "grad_norm": 3.561516761779785,
1142
+ "learning_rate": 5.135802469135803e-05,
1143
+ "loss": 0.1696,
1144
+ "step": 242
1145
+ },
1146
+ {
1147
+ "epoch": 16.266666666666666,
1148
+ "grad_norm": 1.3526779413223267,
1149
+ "learning_rate": 5.0864197530864197e-05,
1150
+ "loss": 0.0665,
1151
+ "step": 244
1152
+ },
1153
+ {
1154
+ "epoch": 16.4,
1155
+ "grad_norm": 4.29080057144165,
1156
+ "learning_rate": 5.0370370370370366e-05,
1157
+ "loss": 0.195,
1158
+ "step": 246
1159
+ },
1160
+ {
1161
+ "epoch": 16.533333333333335,
1162
+ "grad_norm": 6.229769706726074,
1163
+ "learning_rate": 4.987654320987655e-05,
1164
+ "loss": 0.2993,
1165
+ "step": 248
1166
+ },
1167
+ {
1168
+ "epoch": 16.666666666666668,
1169
+ "grad_norm": 4.949665546417236,
1170
+ "learning_rate": 4.938271604938271e-05,
1171
+ "loss": 0.2081,
1172
+ "step": 250
1173
+ },
1174
+ {
1175
+ "epoch": 16.8,
1176
+ "grad_norm": 6.123852252960205,
1177
+ "learning_rate": 4.888888888888889e-05,
1178
+ "loss": 0.212,
1179
+ "step": 252
1180
+ },
1181
+ {
1182
+ "epoch": 16.933333333333334,
1183
+ "grad_norm": 4.0239105224609375,
1184
+ "learning_rate": 4.8395061728395067e-05,
1185
+ "loss": 0.3259,
1186
+ "step": 254
1187
+ },
1188
+ {
1189
+ "epoch": 17.0,
1190
+ "eval_accuracy": 0.6060606060606061,
1191
+ "eval_f1_macro": 0.5467242234296787,
1192
+ "eval_f1_micro": 0.6060606060606061,
1193
+ "eval_f1_weighted": 0.5961390083174005,
1194
+ "eval_loss": 1.3405537605285645,
1195
+ "eval_precision_macro": 0.5614736217067472,
1196
+ "eval_precision_micro": 0.6060606060606061,
1197
+ "eval_precision_weighted": 0.6033042542530208,
1198
+ "eval_recall_macro": 0.5466817838246409,
1199
+ "eval_recall_micro": 0.6060606060606061,
1200
+ "eval_recall_weighted": 0.6060606060606061,
1201
+ "eval_runtime": 2.0502,
1202
+ "eval_samples_per_second": 64.382,
1203
+ "eval_steps_per_second": 2.439,
1204
+ "step": 255
1205
+ },
1206
+ {
1207
+ "epoch": 17.066666666666666,
1208
+ "grad_norm": 1.4321271181106567,
1209
+ "learning_rate": 4.7901234567901237e-05,
1210
+ "loss": 0.055,
1211
+ "step": 256
1212
+ },
1213
+ {
1214
+ "epoch": 17.2,
1215
+ "grad_norm": 2.13454008102417,
1216
+ "learning_rate": 4.740740740740741e-05,
1217
+ "loss": 0.1221,
1218
+ "step": 258
1219
+ },
1220
+ {
1221
+ "epoch": 17.333333333333332,
1222
+ "grad_norm": 5.276524066925049,
1223
+ "learning_rate": 4.691358024691358e-05,
1224
+ "loss": 0.1417,
1225
+ "step": 260
1226
+ },
1227
+ {
1228
+ "epoch": 17.466666666666665,
1229
+ "grad_norm": 3.8555052280426025,
1230
+ "learning_rate": 4.641975308641975e-05,
1231
+ "loss": 0.2943,
1232
+ "step": 262
1233
+ },
1234
+ {
1235
+ "epoch": 17.6,
1236
+ "grad_norm": 4.094534873962402,
1237
+ "learning_rate": 4.592592592592593e-05,
1238
+ "loss": 0.2206,
1239
+ "step": 264
1240
+ },
1241
+ {
1242
+ "epoch": 17.733333333333334,
1243
+ "grad_norm": 4.184159278869629,
1244
+ "learning_rate": 4.54320987654321e-05,
1245
+ "loss": 0.1565,
1246
+ "step": 266
1247
+ },
1248
+ {
1249
+ "epoch": 17.866666666666667,
1250
+ "grad_norm": 5.283144474029541,
1251
+ "learning_rate": 4.493827160493828e-05,
1252
+ "loss": 0.1427,
1253
+ "step": 268
1254
+ },
1255
+ {
1256
+ "epoch": 18.0,
1257
+ "grad_norm": 3.6470813751220703,
1258
+ "learning_rate": 4.4444444444444447e-05,
1259
+ "loss": 0.1123,
1260
+ "step": 270
1261
+ },
1262
+ {
1263
+ "epoch": 18.0,
1264
+ "eval_accuracy": 0.6363636363636364,
1265
+ "eval_f1_macro": 0.5867719657675725,
1266
+ "eval_f1_micro": 0.6363636363636364,
1267
+ "eval_f1_weighted": 0.6305501232595613,
1268
+ "eval_loss": 1.376707911491394,
1269
+ "eval_precision_macro": 0.6257631257631259,
1270
+ "eval_precision_micro": 0.6363636363636364,
1271
+ "eval_precision_weighted": 0.6413447663447664,
1272
+ "eval_recall_macro": 0.5785109599395314,
1273
+ "eval_recall_micro": 0.6363636363636364,
1274
+ "eval_recall_weighted": 0.6363636363636364,
1275
+ "eval_runtime": 1.992,
1276
+ "eval_samples_per_second": 66.266,
1277
+ "eval_steps_per_second": 2.51,
1278
+ "step": 270
1279
+ },
1280
+ {
1281
+ "epoch": 18.133333333333333,
1282
+ "grad_norm": 3.1710643768310547,
1283
+ "learning_rate": 4.3950617283950617e-05,
1284
+ "loss": 0.1219,
1285
+ "step": 272
1286
+ },
1287
+ {
1288
+ "epoch": 18.266666666666666,
1289
+ "grad_norm": 7.098196506500244,
1290
+ "learning_rate": 4.345679012345679e-05,
1291
+ "loss": 0.1588,
1292
+ "step": 274
1293
+ },
1294
+ {
1295
+ "epoch": 18.4,
1296
+ "grad_norm": 1.8567241430282593,
1297
+ "learning_rate": 4.296296296296296e-05,
1298
+ "loss": 0.1043,
1299
+ "step": 276
1300
+ },
1301
+ {
1302
+ "epoch": 18.533333333333335,
1303
+ "grad_norm": 2.1221156120300293,
1304
+ "learning_rate": 4.246913580246914e-05,
1305
+ "loss": 0.0748,
1306
+ "step": 278
1307
+ },
1308
+ {
1309
+ "epoch": 18.666666666666668,
1310
+ "grad_norm": 3.03196120262146,
1311
+ "learning_rate": 4.197530864197531e-05,
1312
+ "loss": 0.1148,
1313
+ "step": 280
1314
+ },
1315
+ {
1316
+ "epoch": 18.8,
1317
+ "grad_norm": 1.7942876815795898,
1318
+ "learning_rate": 4.148148148148148e-05,
1319
+ "loss": 0.0679,
1320
+ "step": 282
1321
+ },
1322
+ {
1323
+ "epoch": 18.933333333333334,
1324
+ "grad_norm": 4.499013900756836,
1325
+ "learning_rate": 4.0987654320987657e-05,
1326
+ "loss": 0.1129,
1327
+ "step": 284
1328
+ },
1329
+ {
1330
+ "epoch": 19.0,
1331
+ "eval_accuracy": 0.6439393939393939,
1332
+ "eval_f1_macro": 0.587916778045086,
1333
+ "eval_f1_micro": 0.6439393939393939,
1334
+ "eval_f1_weighted": 0.6305576751206262,
1335
+ "eval_loss": 1.4679865837097168,
1336
+ "eval_precision_macro": 0.6809288563910413,
1337
+ "eval_precision_micro": 0.6439393939393939,
1338
+ "eval_precision_weighted": 0.6932697872537444,
1339
+ "eval_recall_macro": 0.5806046863189721,
1340
+ "eval_recall_micro": 0.6439393939393939,
1341
+ "eval_recall_weighted": 0.6439393939393939,
1342
+ "eval_runtime": 1.9847,
1343
+ "eval_samples_per_second": 66.508,
1344
+ "eval_steps_per_second": 2.519,
1345
+ "step": 285
1346
+ },
1347
+ {
1348
+ "epoch": 19.066666666666666,
1349
+ "grad_norm": 2.631176233291626,
1350
+ "learning_rate": 4.049382716049383e-05,
1351
+ "loss": 0.1028,
1352
+ "step": 286
1353
+ },
1354
+ {
1355
+ "epoch": 19.2,
1356
+ "grad_norm": 4.930914402008057,
1357
+ "learning_rate": 4e-05,
1358
+ "loss": 0.2555,
1359
+ "step": 288
1360
+ },
1361
+ {
1362
+ "epoch": 19.333333333333332,
1363
+ "grad_norm": 3.355149745941162,
1364
+ "learning_rate": 3.950617283950617e-05,
1365
+ "loss": 0.0792,
1366
+ "step": 290
1367
+ },
1368
+ {
1369
+ "epoch": 19.466666666666665,
1370
+ "grad_norm": 2.2780933380126953,
1371
+ "learning_rate": 3.901234567901234e-05,
1372
+ "loss": 0.0595,
1373
+ "step": 292
1374
+ },
1375
+ {
1376
+ "epoch": 19.6,
1377
+ "grad_norm": 4.880768299102783,
1378
+ "learning_rate": 3.851851851851852e-05,
1379
+ "loss": 0.0756,
1380
+ "step": 294
1381
+ },
1382
+ {
1383
+ "epoch": 19.733333333333334,
1384
+ "grad_norm": 2.175165891647339,
1385
+ "learning_rate": 3.80246913580247e-05,
1386
+ "loss": 0.1077,
1387
+ "step": 296
1388
+ },
1389
+ {
1390
+ "epoch": 19.866666666666667,
1391
+ "grad_norm": 2.6557981967926025,
1392
+ "learning_rate": 3.7530864197530867e-05,
1393
+ "loss": 0.1094,
1394
+ "step": 298
1395
+ },
1396
+ {
1397
+ "epoch": 20.0,
1398
+ "grad_norm": 1.2508912086486816,
1399
+ "learning_rate": 3.7037037037037037e-05,
1400
+ "loss": 0.0651,
1401
+ "step": 300
1402
+ },
1403
+ {
1404
+ "epoch": 20.0,
1405
+ "eval_accuracy": 0.6893939393939394,
1406
+ "eval_f1_macro": 0.6655257312106627,
1407
+ "eval_f1_micro": 0.6893939393939394,
1408
+ "eval_f1_weighted": 0.687595503348928,
1409
+ "eval_loss": 1.4981398582458496,
1410
+ "eval_precision_macro": 0.7114991648833447,
1411
+ "eval_precision_micro": 0.6893939393939394,
1412
+ "eval_precision_weighted": 0.7224498247915767,
1413
+ "eval_recall_macro": 0.6510808767951625,
1414
+ "eval_recall_micro": 0.6893939393939394,
1415
+ "eval_recall_weighted": 0.6893939393939394,
1416
+ "eval_runtime": 1.9861,
1417
+ "eval_samples_per_second": 66.462,
1418
+ "eval_steps_per_second": 2.517,
1419
+ "step": 300
1420
+ },
1421
+ {
1422
+ "epoch": 20.133333333333333,
1423
+ "grad_norm": 5.263727188110352,
1424
+ "learning_rate": 3.654320987654321e-05,
1425
+ "loss": 0.075,
1426
+ "step": 302
1427
+ },
1428
+ {
1429
+ "epoch": 20.266666666666666,
1430
+ "grad_norm": 4.619281768798828,
1431
+ "learning_rate": 3.604938271604938e-05,
1432
+ "loss": 0.1319,
1433
+ "step": 304
1434
+ },
1435
+ {
1436
+ "epoch": 20.4,
1437
+ "grad_norm": 1.0995675325393677,
1438
+ "learning_rate": 3.555555555555556e-05,
1439
+ "loss": 0.0366,
1440
+ "step": 306
1441
+ },
1442
+ {
1443
+ "epoch": 20.533333333333335,
1444
+ "grad_norm": 4.2385663986206055,
1445
+ "learning_rate": 3.506172839506173e-05,
1446
+ "loss": 0.1331,
1447
+ "step": 308
1448
+ },
1449
+ {
1450
+ "epoch": 20.666666666666668,
1451
+ "grad_norm": 2.6913745403289795,
1452
+ "learning_rate": 3.45679012345679e-05,
1453
+ "loss": 0.0894,
1454
+ "step": 310
1455
+ },
1456
+ {
1457
+ "epoch": 20.8,
1458
+ "grad_norm": 4.785970687866211,
1459
+ "learning_rate": 3.4074074074074077e-05,
1460
+ "loss": 0.0756,
1461
+ "step": 312
1462
+ },
1463
+ {
1464
+ "epoch": 20.933333333333334,
1465
+ "grad_norm": 1.5702877044677734,
1466
+ "learning_rate": 3.3580246913580247e-05,
1467
+ "loss": 0.0685,
1468
+ "step": 314
1469
+ },
1470
+ {
1471
+ "epoch": 21.0,
1472
+ "eval_accuracy": 0.6515151515151515,
1473
+ "eval_f1_macro": 0.6091138915880551,
1474
+ "eval_f1_micro": 0.6515151515151515,
1475
+ "eval_f1_weighted": 0.6494256262321655,
1476
+ "eval_loss": 1.4620611667633057,
1477
+ "eval_precision_macro": 0.630280884283538,
1478
+ "eval_precision_micro": 0.6515151515151515,
1479
+ "eval_precision_weighted": 0.664075183502428,
1480
+ "eval_recall_macro": 0.6039682539682539,
1481
+ "eval_recall_micro": 0.6515151515151515,
1482
+ "eval_recall_weighted": 0.6515151515151515,
1483
+ "eval_runtime": 2.0276,
1484
+ "eval_samples_per_second": 65.103,
1485
+ "eval_steps_per_second": 2.466,
1486
+ "step": 315
1487
+ },
1488
+ {
1489
+ "epoch": 21.066666666666666,
1490
+ "grad_norm": 0.6037698984146118,
1491
+ "learning_rate": 3.308641975308642e-05,
1492
+ "loss": 0.0537,
1493
+ "step": 316
1494
+ },
1495
+ {
1496
+ "epoch": 21.2,
1497
+ "grad_norm": 0.877955436706543,
1498
+ "learning_rate": 3.25925925925926e-05,
1499
+ "loss": 0.0283,
1500
+ "step": 318
1501
+ },
1502
+ {
1503
+ "epoch": 21.333333333333332,
1504
+ "grad_norm": 4.185865879058838,
1505
+ "learning_rate": 3.209876543209876e-05,
1506
+ "loss": 0.1153,
1507
+ "step": 320
1508
+ },
1509
+ {
1510
+ "epoch": 21.466666666666665,
1511
+ "grad_norm": 0.7465834021568298,
1512
+ "learning_rate": 3.160493827160494e-05,
1513
+ "loss": 0.0311,
1514
+ "step": 322
1515
+ },
1516
+ {
1517
+ "epoch": 21.6,
1518
+ "grad_norm": 1.4049850702285767,
1519
+ "learning_rate": 3.111111111111111e-05,
1520
+ "loss": 0.0641,
1521
+ "step": 324
1522
+ },
1523
+ {
1524
+ "epoch": 21.733333333333334,
1525
+ "grad_norm": 1.6191234588623047,
1526
+ "learning_rate": 3.061728395061729e-05,
1527
+ "loss": 0.0417,
1528
+ "step": 326
1529
+ },
1530
+ {
1531
+ "epoch": 21.866666666666667,
1532
+ "grad_norm": 1.2088876962661743,
1533
+ "learning_rate": 3.012345679012346e-05,
1534
+ "loss": 0.0314,
1535
+ "step": 328
1536
+ },
1537
+ {
1538
+ "epoch": 22.0,
1539
+ "grad_norm": 0.7652052640914917,
1540
+ "learning_rate": 2.962962962962963e-05,
1541
+ "loss": 0.1469,
1542
+ "step": 330
1543
+ },
1544
+ {
1545
+ "epoch": 22.0,
1546
+ "eval_accuracy": 0.6212121212121212,
1547
+ "eval_f1_macro": 0.5330299221627766,
1548
+ "eval_f1_micro": 0.6212121212121212,
1549
+ "eval_f1_weighted": 0.604041002442862,
1550
+ "eval_loss": 1.534732699394226,
1551
+ "eval_precision_macro": 0.5476940619507992,
1552
+ "eval_precision_micro": 0.6212121212121212,
1553
+ "eval_precision_weighted": 0.6148931558944467,
1554
+ "eval_recall_macro": 0.5439984882842026,
1555
+ "eval_recall_micro": 0.6212121212121212,
1556
+ "eval_recall_weighted": 0.6212121212121212,
1557
+ "eval_runtime": 1.9747,
1558
+ "eval_samples_per_second": 66.844,
1559
+ "eval_steps_per_second": 2.532,
1560
+ "step": 330
1561
+ },
1562
+ {
1563
+ "epoch": 22.133333333333333,
1564
+ "grad_norm": 3.304185152053833,
1565
+ "learning_rate": 2.9135802469135803e-05,
1566
+ "loss": 0.0456,
1567
+ "step": 332
1568
+ },
1569
+ {
1570
+ "epoch": 22.266666666666666,
1571
+ "grad_norm": 2.3118255138397217,
1572
+ "learning_rate": 2.8641975308641977e-05,
1573
+ "loss": 0.0377,
1574
+ "step": 334
1575
+ },
1576
+ {
1577
+ "epoch": 22.4,
1578
+ "grad_norm": 2.3639698028564453,
1579
+ "learning_rate": 2.814814814814815e-05,
1580
+ "loss": 0.0708,
1581
+ "step": 336
1582
+ },
1583
+ {
1584
+ "epoch": 22.533333333333335,
1585
+ "grad_norm": 1.741746187210083,
1586
+ "learning_rate": 2.765432098765432e-05,
1587
+ "loss": 0.0353,
1588
+ "step": 338
1589
+ },
1590
+ {
1591
+ "epoch": 22.666666666666668,
1592
+ "grad_norm": 0.6108101010322571,
1593
+ "learning_rate": 2.7160493827160493e-05,
1594
+ "loss": 0.0531,
1595
+ "step": 340
1596
+ },
1597
+ {
1598
+ "epoch": 22.8,
1599
+ "grad_norm": 2.961045503616333,
1600
+ "learning_rate": 2.6666666666666667e-05,
1601
+ "loss": 0.0394,
1602
+ "step": 342
1603
+ },
1604
+ {
1605
+ "epoch": 22.933333333333334,
1606
+ "grad_norm": 1.7298003435134888,
1607
+ "learning_rate": 2.617283950617284e-05,
1608
+ "loss": 0.0289,
1609
+ "step": 344
1610
+ },
1611
+ {
1612
+ "epoch": 23.0,
1613
+ "eval_accuracy": 0.6287878787878788,
1614
+ "eval_f1_macro": 0.5465784005632545,
1615
+ "eval_f1_micro": 0.6287878787878788,
1616
+ "eval_f1_weighted": 0.6179920372130975,
1617
+ "eval_loss": 1.5417176485061646,
1618
+ "eval_precision_macro": 0.5409361471861472,
1619
+ "eval_precision_micro": 0.6287878787878788,
1620
+ "eval_precision_weighted": 0.610816976584022,
1621
+ "eval_recall_macro": 0.5549206349206349,
1622
+ "eval_recall_micro": 0.6287878787878788,
1623
+ "eval_recall_weighted": 0.6287878787878788,
1624
+ "eval_runtime": 1.9902,
1625
+ "eval_samples_per_second": 66.326,
1626
+ "eval_steps_per_second": 2.512,
1627
+ "step": 345
1628
+ },
1629
+ {
1630
+ "epoch": 23.066666666666666,
1631
+ "grad_norm": 0.7690654397010803,
1632
+ "learning_rate": 2.5679012345679017e-05,
1633
+ "loss": 0.0458,
1634
+ "step": 346
1635
+ },
1636
+ {
1637
+ "epoch": 23.2,
1638
+ "grad_norm": 3.320651054382324,
1639
+ "learning_rate": 2.5185185185185183e-05,
1640
+ "loss": 0.0804,
1641
+ "step": 348
1642
+ },
1643
+ {
1644
+ "epoch": 23.333333333333332,
1645
+ "grad_norm": 2.0301012992858887,
1646
+ "learning_rate": 2.4691358024691357e-05,
1647
+ "loss": 0.0279,
1648
+ "step": 350
1649
+ },
1650
+ {
1651
+ "epoch": 23.466666666666665,
1652
+ "grad_norm": 0.4531901478767395,
1653
+ "learning_rate": 2.4197530864197533e-05,
1654
+ "loss": 0.0139,
1655
+ "step": 352
1656
+ },
1657
+ {
1658
+ "epoch": 23.6,
1659
+ "grad_norm": 2.56703519821167,
1660
+ "learning_rate": 2.3703703703703707e-05,
1661
+ "loss": 0.0783,
1662
+ "step": 354
1663
+ },
1664
+ {
1665
+ "epoch": 23.733333333333334,
1666
+ "grad_norm": 0.20635652542114258,
1667
+ "learning_rate": 2.3209876543209877e-05,
1668
+ "loss": 0.012,
1669
+ "step": 356
1670
+ },
1671
+ {
1672
+ "epoch": 23.866666666666667,
1673
+ "grad_norm": 0.5930025577545166,
1674
+ "learning_rate": 2.271604938271605e-05,
1675
+ "loss": 0.0145,
1676
+ "step": 358
1677
+ },
1678
+ {
1679
+ "epoch": 24.0,
1680
+ "grad_norm": 0.24041523039340973,
1681
+ "learning_rate": 2.2222222222222223e-05,
1682
+ "loss": 0.01,
1683
+ "step": 360
1684
+ },
1685
+ {
1686
+ "epoch": 24.0,
1687
+ "eval_accuracy": 0.6363636363636364,
1688
+ "eval_f1_macro": 0.5474889044983636,
1689
+ "eval_f1_micro": 0.6363636363636364,
1690
+ "eval_f1_weighted": 0.6187343775995573,
1691
+ "eval_loss": 1.5670151710510254,
1692
+ "eval_precision_macro": 0.5434552419168567,
1693
+ "eval_precision_micro": 0.6363636363636364,
1694
+ "eval_precision_weighted": 0.6103857259761386,
1695
+ "eval_recall_macro": 0.5594179894179894,
1696
+ "eval_recall_micro": 0.6363636363636364,
1697
+ "eval_recall_weighted": 0.6363636363636364,
1698
+ "eval_runtime": 1.9405,
1699
+ "eval_samples_per_second": 68.023,
1700
+ "eval_steps_per_second": 2.577,
1701
+ "step": 360
1702
+ },
1703
+ {
1704
+ "epoch": 24.133333333333333,
1705
+ "grad_norm": 0.21558411419391632,
1706
+ "learning_rate": 2.1728395061728397e-05,
1707
+ "loss": 0.0102,
1708
+ "step": 362
1709
+ },
1710
+ {
1711
+ "epoch": 24.266666666666666,
1712
+ "grad_norm": 3.2394814491271973,
1713
+ "learning_rate": 2.123456790123457e-05,
1714
+ "loss": 0.0218,
1715
+ "step": 364
1716
+ },
1717
+ {
1718
+ "epoch": 24.4,
1719
+ "grad_norm": 3.6115405559539795,
1720
+ "learning_rate": 2.074074074074074e-05,
1721
+ "loss": 0.1149,
1722
+ "step": 366
1723
+ },
1724
+ {
1725
+ "epoch": 24.533333333333335,
1726
+ "grad_norm": 0.1589735597372055,
1727
+ "learning_rate": 2.0246913580246917e-05,
1728
+ "loss": 0.0082,
1729
+ "step": 368
1730
+ },
1731
+ {
1732
+ "epoch": 24.666666666666668,
1733
+ "grad_norm": 1.3840848207473755,
1734
+ "learning_rate": 1.9753086419753087e-05,
1735
+ "loss": 0.0174,
1736
+ "step": 370
1737
+ },
1738
+ {
1739
+ "epoch": 24.8,
1740
+ "grad_norm": 3.772754192352295,
1741
+ "learning_rate": 1.925925925925926e-05,
1742
+ "loss": 0.043,
1743
+ "step": 372
1744
+ },
1745
+ {
1746
+ "epoch": 24.933333333333334,
1747
+ "grad_norm": 0.41601723432540894,
1748
+ "learning_rate": 1.8765432098765433e-05,
1749
+ "loss": 0.035,
1750
+ "step": 374
1751
+ },
1752
+ {
1753
+ "epoch": 25.0,
1754
+ "eval_accuracy": 0.6363636363636364,
1755
+ "eval_f1_macro": 0.5529395694676043,
1756
+ "eval_f1_micro": 0.6363636363636364,
1757
+ "eval_f1_weighted": 0.6209326623035122,
1758
+ "eval_loss": 1.6037245988845825,
1759
+ "eval_precision_macro": 0.5470247238680418,
1760
+ "eval_precision_micro": 0.6363636363636364,
1761
+ "eval_precision_weighted": 0.6156263091746962,
1762
+ "eval_recall_macro": 0.5679213907785335,
1763
+ "eval_recall_micro": 0.6363636363636364,
1764
+ "eval_recall_weighted": 0.6363636363636364,
1765
+ "eval_runtime": 1.9551,
1766
+ "eval_samples_per_second": 67.515,
1767
+ "eval_steps_per_second": 2.557,
1768
+ "step": 375
1769
+ },
1770
+ {
1771
+ "epoch": 25.066666666666666,
1772
+ "grad_norm": 0.4554837942123413,
1773
+ "learning_rate": 1.8271604938271607e-05,
1774
+ "loss": 0.0636,
1775
+ "step": 376
1776
+ },
1777
+ {
1778
+ "epoch": 25.2,
1779
+ "grad_norm": 4.667645454406738,
1780
+ "learning_rate": 1.777777777777778e-05,
1781
+ "loss": 0.0685,
1782
+ "step": 378
1783
+ },
1784
+ {
1785
+ "epoch": 25.333333333333332,
1786
+ "grad_norm": 6.68064022064209,
1787
+ "learning_rate": 1.728395061728395e-05,
1788
+ "loss": 0.0919,
1789
+ "step": 380
1790
+ },
1791
+ {
1792
+ "epoch": 25.466666666666665,
1793
+ "grad_norm": 0.2510056793689728,
1794
+ "learning_rate": 1.6790123456790123e-05,
1795
+ "loss": 0.0111,
1796
+ "step": 382
1797
+ },
1798
+ {
1799
+ "epoch": 25.6,
1800
+ "grad_norm": 0.6245520114898682,
1801
+ "learning_rate": 1.62962962962963e-05,
1802
+ "loss": 0.0134,
1803
+ "step": 384
1804
+ },
1805
+ {
1806
+ "epoch": 25.733333333333334,
1807
+ "grad_norm": 2.165201187133789,
1808
+ "learning_rate": 1.580246913580247e-05,
1809
+ "loss": 0.0271,
1810
+ "step": 386
1811
+ },
1812
+ {
1813
+ "epoch": 25.866666666666667,
1814
+ "grad_norm": 0.24112091958522797,
1815
+ "learning_rate": 1.5308641975308643e-05,
1816
+ "loss": 0.0105,
1817
+ "step": 388
1818
+ },
1819
+ {
1820
+ "epoch": 26.0,
1821
+ "grad_norm": 0.377363383769989,
1822
+ "learning_rate": 1.4814814814814815e-05,
1823
+ "loss": 0.0109,
1824
+ "step": 390
1825
+ },
1826
+ {
1827
+ "epoch": 26.0,
1828
+ "eval_accuracy": 0.6212121212121212,
1829
+ "eval_f1_macro": 0.5896814040471776,
1830
+ "eval_f1_micro": 0.6212121212121212,
1831
+ "eval_f1_weighted": 0.6203213160225189,
1832
+ "eval_loss": 1.6751586198806763,
1833
+ "eval_precision_macro": 0.6144605795534588,
1834
+ "eval_precision_micro": 0.6212121212121212,
1835
+ "eval_precision_weighted": 0.6527441598649029,
1836
+ "eval_recall_macro": 0.5999697656840514,
1837
+ "eval_recall_micro": 0.6212121212121212,
1838
+ "eval_recall_weighted": 0.6212121212121212,
1839
+ "eval_runtime": 1.9656,
1840
+ "eval_samples_per_second": 67.154,
1841
+ "eval_steps_per_second": 2.544,
1842
+ "step": 390
1843
+ },
1844
+ {
1845
+ "epoch": 26.133333333333333,
1846
+ "grad_norm": 0.3774866461753845,
1847
+ "learning_rate": 1.4320987654320988e-05,
1848
+ "loss": 0.0097,
1849
+ "step": 392
1850
+ },
1851
+ {
1852
+ "epoch": 26.266666666666666,
1853
+ "grad_norm": 3.956695079803467,
1854
+ "learning_rate": 1.382716049382716e-05,
1855
+ "loss": 0.0233,
1856
+ "step": 394
1857
+ },
1858
+ {
1859
+ "epoch": 26.4,
1860
+ "grad_norm": 0.5877533555030823,
1861
+ "learning_rate": 1.3333333333333333e-05,
1862
+ "loss": 0.0157,
1863
+ "step": 396
1864
+ },
1865
+ {
1866
+ "epoch": 26.533333333333335,
1867
+ "grad_norm": 1.2962318658828735,
1868
+ "learning_rate": 1.2839506172839508e-05,
1869
+ "loss": 0.0249,
1870
+ "step": 398
1871
+ },
1872
+ {
1873
+ "epoch": 26.666666666666668,
1874
+ "grad_norm": 2.2431485652923584,
1875
+ "learning_rate": 1.2345679012345678e-05,
1876
+ "loss": 0.0224,
1877
+ "step": 400
1878
+ },
1879
+ {
1880
+ "epoch": 26.8,
1881
+ "grad_norm": 0.21492817997932434,
1882
+ "learning_rate": 1.1851851851851853e-05,
1883
+ "loss": 0.0117,
1884
+ "step": 402
1885
+ },
1886
+ {
1887
+ "epoch": 26.933333333333334,
1888
+ "grad_norm": 0.4237399697303772,
1889
+ "learning_rate": 1.1358024691358025e-05,
1890
+ "loss": 0.038,
1891
+ "step": 404
1892
+ },
1893
+ {
1894
+ "epoch": 27.0,
1895
+ "eval_accuracy": 0.6136363636363636,
1896
+ "eval_f1_macro": 0.5343822919199936,
1897
+ "eval_f1_micro": 0.6136363636363636,
1898
+ "eval_f1_weighted": 0.6008425380028616,
1899
+ "eval_loss": 1.672375202178955,
1900
+ "eval_precision_macro": 0.5331553830282576,
1901
+ "eval_precision_micro": 0.6136363636363636,
1902
+ "eval_precision_weighted": 0.6004965634415023,
1903
+ "eval_recall_macro": 0.546832955404384,
1904
+ "eval_recall_micro": 0.6136363636363636,
1905
+ "eval_recall_weighted": 0.6136363636363636,
1906
+ "eval_runtime": 1.9801,
1907
+ "eval_samples_per_second": 66.662,
1908
+ "eval_steps_per_second": 2.525,
1909
+ "step": 405
1910
+ },
1911
+ {
1912
+ "epoch": 27.066666666666666,
1913
+ "grad_norm": 1.5725435018539429,
1914
+ "learning_rate": 1.0864197530864198e-05,
1915
+ "loss": 0.0149,
1916
+ "step": 406
1917
+ },
1918
+ {
1919
+ "epoch": 27.2,
1920
+ "grad_norm": 0.13784648478031158,
1921
+ "learning_rate": 1.037037037037037e-05,
1922
+ "loss": 0.0092,
1923
+ "step": 408
1924
+ },
1925
+ {
1926
+ "epoch": 27.333333333333332,
1927
+ "grad_norm": 0.09840863198041916,
1928
+ "learning_rate": 9.876543209876543e-06,
1929
+ "loss": 0.008,
1930
+ "step": 410
1931
+ },
1932
+ {
1933
+ "epoch": 27.466666666666665,
1934
+ "grad_norm": 0.8349915146827698,
1935
+ "learning_rate": 9.382716049382717e-06,
1936
+ "loss": 0.0206,
1937
+ "step": 412
1938
+ },
1939
+ {
1940
+ "epoch": 27.6,
1941
+ "grad_norm": 0.33149102330207825,
1942
+ "learning_rate": 8.88888888888889e-06,
1943
+ "loss": 0.0173,
1944
+ "step": 414
1945
+ },
1946
+ {
1947
+ "epoch": 27.733333333333334,
1948
+ "grad_norm": 0.3867279589176178,
1949
+ "learning_rate": 8.395061728395062e-06,
1950
+ "loss": 0.0093,
1951
+ "step": 416
1952
+ },
1953
+ {
1954
+ "epoch": 27.866666666666667,
1955
+ "grad_norm": 1.726897120475769,
1956
+ "learning_rate": 7.901234567901235e-06,
1957
+ "loss": 0.0214,
1958
+ "step": 418
1959
+ },
1960
+ {
1961
+ "epoch": 28.0,
1962
+ "grad_norm": 0.19306233525276184,
1963
+ "learning_rate": 7.4074074074074075e-06,
1964
+ "loss": 0.0116,
1965
+ "step": 420
1966
+ },
1967
+ {
1968
+ "epoch": 28.0,
1969
+ "eval_accuracy": 0.6212121212121212,
1970
+ "eval_f1_macro": 0.5383730158730159,
1971
+ "eval_f1_micro": 0.6212121212121212,
1972
+ "eval_f1_weighted": 0.609029280904281,
1973
+ "eval_loss": 1.6251877546310425,
1974
+ "eval_precision_macro": 0.533696432596027,
1975
+ "eval_precision_micro": 0.6212121212121212,
1976
+ "eval_precision_weighted": 0.6033010121498966,
1977
+ "eval_recall_macro": 0.5490778533635676,
1978
+ "eval_recall_micro": 0.6212121212121212,
1979
+ "eval_recall_weighted": 0.6212121212121212,
1980
+ "eval_runtime": 1.9285,
1981
+ "eval_samples_per_second": 68.448,
1982
+ "eval_steps_per_second": 2.593,
1983
+ "step": 420
1984
+ },
1985
+ {
1986
+ "epoch": 28.133333333333333,
1987
+ "grad_norm": 1.669783115386963,
1988
+ "learning_rate": 6.91358024691358e-06,
1989
+ "loss": 0.0318,
1990
+ "step": 422
1991
+ },
1992
+ {
1993
+ "epoch": 28.266666666666666,
1994
+ "grad_norm": 0.6250646114349365,
1995
+ "learning_rate": 6.419753086419754e-06,
1996
+ "loss": 0.0195,
1997
+ "step": 424
1998
+ },
1999
+ {
2000
+ "epoch": 28.4,
2001
+ "grad_norm": 0.4752732813358307,
2002
+ "learning_rate": 5.925925925925927e-06,
2003
+ "loss": 0.0124,
2004
+ "step": 426
2005
+ },
2006
+ {
2007
+ "epoch": 28.533333333333335,
2008
+ "grad_norm": 0.16341274976730347,
2009
+ "learning_rate": 5.432098765432099e-06,
2010
+ "loss": 0.0065,
2011
+ "step": 428
2012
+ },
2013
+ {
2014
+ "epoch": 28.666666666666668,
2015
+ "grad_norm": 0.08904340863227844,
2016
+ "learning_rate": 4.938271604938272e-06,
2017
+ "loss": 0.0062,
2018
+ "step": 430
2019
+ },
2020
+ {
2021
+ "epoch": 28.8,
2022
+ "grad_norm": 0.24332502484321594,
2023
+ "learning_rate": 4.444444444444445e-06,
2024
+ "loss": 0.0055,
2025
+ "step": 432
2026
+ },
2027
+ {
2028
+ "epoch": 28.933333333333334,
2029
+ "grad_norm": 0.47205692529678345,
2030
+ "learning_rate": 3.9506172839506175e-06,
2031
+ "loss": 0.006,
2032
+ "step": 434
2033
+ },
2034
+ {
2035
+ "epoch": 29.0,
2036
+ "eval_accuracy": 0.6363636363636364,
2037
+ "eval_f1_macro": 0.557191887992969,
2038
+ "eval_f1_micro": 0.6363636363636364,
2039
+ "eval_f1_weighted": 0.6294141170899599,
2040
+ "eval_loss": 1.597952961921692,
2041
+ "eval_precision_macro": 0.5529214559386972,
2042
+ "eval_precision_micro": 0.6363636363636364,
2043
+ "eval_precision_weighted": 0.6245954516428655,
2044
+ "eval_recall_macro": 0.563363567649282,
2045
+ "eval_recall_micro": 0.6363636363636364,
2046
+ "eval_recall_weighted": 0.6363636363636364,
2047
+ "eval_runtime": 1.9083,
2048
+ "eval_samples_per_second": 69.172,
2049
+ "eval_steps_per_second": 2.62,
2050
+ "step": 435
2051
+ },
2052
+ {
2053
+ "epoch": 29.066666666666666,
2054
+ "grad_norm": 0.27642032504081726,
2055
+ "learning_rate": 3.45679012345679e-06,
2056
+ "loss": 0.0162,
2057
+ "step": 436
2058
+ },
2059
+ {
2060
+ "epoch": 29.2,
2061
+ "grad_norm": 0.9449041485786438,
2062
+ "learning_rate": 2.9629629629629633e-06,
2063
+ "loss": 0.0088,
2064
+ "step": 438
2065
+ },
2066
+ {
2067
+ "epoch": 29.333333333333332,
2068
+ "grad_norm": 0.14337310194969177,
2069
+ "learning_rate": 2.469135802469136e-06,
2070
+ "loss": 0.0193,
2071
+ "step": 440
2072
+ },
2073
+ {
2074
+ "epoch": 29.466666666666665,
2075
+ "grad_norm": 0.17881515622138977,
2076
+ "learning_rate": 1.9753086419753087e-06,
2077
+ "loss": 0.0191,
2078
+ "step": 442
2079
+ },
2080
+ {
2081
+ "epoch": 29.6,
2082
+ "grad_norm": 0.15386801958084106,
2083
+ "learning_rate": 1.4814814814814817e-06,
2084
+ "loss": 0.005,
2085
+ "step": 444
2086
+ },
2087
+ {
2088
+ "epoch": 29.733333333333334,
2089
+ "grad_norm": 0.32567164301872253,
2090
+ "learning_rate": 9.876543209876544e-07,
2091
+ "loss": 0.0077,
2092
+ "step": 446
2093
+ },
2094
+ {
2095
+ "epoch": 29.866666666666667,
2096
+ "grad_norm": 0.6249086260795593,
2097
+ "learning_rate": 4.938271604938272e-07,
2098
+ "loss": 0.0158,
2099
+ "step": 448
2100
+ },
2101
+ {
2102
+ "epoch": 30.0,
2103
+ "grad_norm": 0.24103443324565887,
2104
+ "learning_rate": 0.0,
2105
+ "loss": 0.0046,
2106
+ "step": 450
2107
+ },
2108
+ {
2109
+ "epoch": 30.0,
2110
+ "eval_accuracy": 0.6439393939393939,
2111
+ "eval_f1_macro": 0.5605037390491809,
2112
+ "eval_f1_micro": 0.6439393939393939,
2113
+ "eval_f1_weighted": 0.634156085647718,
2114
+ "eval_loss": 1.593876838684082,
2115
+ "eval_precision_macro": 0.5545634920634921,
2116
+ "eval_precision_micro": 0.6439393939393939,
2117
+ "eval_precision_weighted": 0.6269465488215488,
2118
+ "eval_recall_macro": 0.5686545729402873,
2119
+ "eval_recall_micro": 0.6439393939393939,
2120
+ "eval_recall_weighted": 0.6439393939393939,
2121
+ "eval_runtime": 2.0538,
2122
+ "eval_samples_per_second": 64.27,
2123
+ "eval_steps_per_second": 2.434,
2124
+ "step": 450
2125
+ },
2126
+ {
2127
+ "epoch": 30.0,
2128
+ "step": 450,
2129
+ "total_flos": 1.0740871074163507e+18,
2130
+ "train_loss": 0.6009381743893027,
2131
+ "train_runtime": 318.0836,
2132
+ "train_samples_per_second": 43.573,
2133
+ "train_steps_per_second": 1.415
2134
+ }
2135
+ ],
2136
+ "logging_steps": 2,
2137
+ "max_steps": 450,
2138
+ "num_input_tokens_seen": 0,
2139
+ "num_train_epochs": 30,
2140
+ "save_steps": 500,
2141
+ "stateful_callbacks": {
2142
+ "TrainerControl": {
2143
+ "args": {
2144
+ "should_epoch_stop": false,
2145
+ "should_evaluate": false,
2146
+ "should_log": false,
2147
+ "should_save": true,
2148
+ "should_training_stop": true
2149
+ },
2150
+ "attributes": {}
2151
+ }
2152
+ },
2153
+ "total_flos": 1.0740871074163507e+18,
2154
+ "train_batch_size": 32,
2155
+ "trial_name": null,
2156
+ "trial_params": null
2157
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:910c7e0f885cf1c63f065cd16c8f164a2f028db680bd03a0f4461e25e8be9754
3
+ size 5368