corranm commited on
Commit
7801e83
·
verified ·
1 Parent(s): 6c8e0c0

End of training

Browse files
README.md CHANGED
@@ -1,7 +1,7 @@
1
  ---
2
  library_name: transformers
3
  license: apache-2.0
4
- base_model: google/vit-base-patch16-224-in21k
5
  tags:
6
  - generated_from_trainer
7
  metrics:
@@ -16,19 +16,19 @@ should probably proofread and complete it, then remove this comment. -->
16
 
17
  # squarerun_earlystop
18
 
19
- This model is a fine-tuned version of [google/vit-base-patch16-224-in21k](https://huggingface.co/google/vit-base-patch16-224-in21k) on an unknown dataset.
20
  It achieves the following results on the evaluation set:
21
- - Loss: 1.3037
22
- - F1 Macro: 0.4518
23
- - F1 Micro: 0.5303
24
- - F1 Weighted: 0.4968
25
- - Precision Macro: 0.4451
26
- - Precision Micro: 0.5303
27
- - Precision Weighted: 0.4972
28
- - Recall Macro: 0.4879
29
- - Recall Micro: 0.5303
30
- - Recall Weighted: 0.5303
31
- - Accuracy: 0.5303
32
 
33
  ## Model description
34
 
@@ -62,25 +62,24 @@ The following hyperparameters were used during training:
62
 
63
  | Training Loss | Epoch | Step | Validation Loss | F1 Macro | F1 Micro | F1 Weighted | Precision Macro | Precision Micro | Precision Weighted | Recall Macro | Recall Micro | Recall Weighted | Accuracy |
64
  |:-------------:|:-----:|:----:|:---------------:|:--------:|:--------:|:-----------:|:---------------:|:---------------:|:------------------:|:------------:|:------------:|:---------------:|:--------:|
65
- | 1.9079 | 1.0 | 29 | 1.8821 | 0.0791 | 0.1894 | 0.1087 | 0.0565 | 0.1894 | 0.0775 | 0.1369 | 0.1894 | 0.1894 | 0.1894 |
66
- | 1.8597 | 2.0 | 58 | 1.8469 | 0.1862 | 0.2879 | 0.2344 | 0.1884 | 0.2879 | 0.2345 | 0.2281 | 0.2879 | 0.2879 | 0.2879 |
67
- | 1.9027 | 3.0 | 87 | 1.7611 | 0.1729 | 0.3106 | 0.2221 | 0.1526 | 0.3106 | 0.1941 | 0.2359 | 0.3106 | 0.3106 | 0.3106 |
68
- | 1.5401 | 4.0 | 116 | 1.6302 | 0.2466 | 0.3788 | 0.3014 | 0.2602 | 0.3788 | 0.3007 | 0.2993 | 0.3788 | 0.3788 | 0.3788 |
69
- | 1.4 | 5.0 | 145 | 1.5180 | 0.2897 | 0.3864 | 0.3245 | 0.4033 | 0.3864 | 0.4498 | 0.3426 | 0.3864 | 0.3864 | 0.3864 |
70
- | 1.6258 | 6.0 | 174 | 1.4915 | 0.3342 | 0.4545 | 0.3925 | 0.4129 | 0.4545 | 0.4454 | 0.3678 | 0.4545 | 0.4545 | 0.4545 |
71
- | 1.3579 | 7.0 | 203 | 1.3221 | 0.4270 | 0.5303 | 0.4918 | 0.4623 | 0.5303 | 0.5269 | 0.4614 | 0.5303 | 0.5303 | 0.5303 |
72
- | 1.236 | 8.0 | 232 | 1.3025 | 0.4404 | 0.5303 | 0.5073 | 0.4672 | 0.5303 | 0.5370 | 0.4579 | 0.5303 | 0.5303 | 0.5303 |
73
- | 0.7554 | 9.0 | 261 | 1.2687 | 0.4409 | 0.5227 | 0.5085 | 0.4555 | 0.5227 | 0.5196 | 0.4547 | 0.5227 | 0.5227 | 0.5227 |
74
- | 1.2034 | 10.0 | 290 | 1.3682 | 0.4223 | 0.5152 | 0.4898 | 0.4735 | 0.5152 | 0.5571 | 0.4478 | 0.5152 | 0.5152 | 0.5152 |
75
- | 0.8037 | 11.0 | 319 | 1.1536 | 0.5364 | 0.6212 | 0.6067 | 0.5305 | 0.6212 | 0.6093 | 0.5558 | 0.6212 | 0.6212 | 0.6212 |
76
- | 0.593 | 12.0 | 348 | 1.4809 | 0.4485 | 0.4924 | 0.5033 | 0.5444 | 0.4924 | 0.6405 | 0.4709 | 0.4924 | 0.4924 | 0.4924 |
77
- | 0.732 | 13.0 | 377 | 1.1918 | 0.5145 | 0.5909 | 0.5809 | 0.5642 | 0.5909 | 0.6448 | 0.5288 | 0.5909 | 0.5909 | 0.5909 |
78
- | 0.8642 | 14.0 | 406 | 1.1884 | 0.5114 | 0.5985 | 0.5884 | 0.5284 | 0.5985 | 0.6108 | 0.5244 | 0.5985 | 0.5985 | 0.5985 |
79
 
80
 
81
  ### Framework versions
82
 
83
- - Transformers 4.48.1
84
- - Pytorch 2.5.1+cu124
85
  - Datasets 3.2.0
86
  - Tokenizers 0.21.0
 
1
  ---
2
  library_name: transformers
3
  license: apache-2.0
4
+ base_model: google/vit-large-patch16-224
5
  tags:
6
  - generated_from_trainer
7
  metrics:
 
16
 
17
  # squarerun_earlystop
18
 
19
+ This model is a fine-tuned version of [google/vit-large-patch16-224](https://huggingface.co/google/vit-large-patch16-224) on an unknown dataset.
20
  It achieves the following results on the evaluation set:
21
+ - Loss: 1.2750
22
+ - F1 Macro: 0.4568
23
+ - F1 Micro: 0.5455
24
+ - F1 Weighted: 0.5111
25
+ - Precision Macro: 0.4686
26
+ - Precision Micro: 0.5455
27
+ - Precision Weighted: 0.5173
28
+ - Recall Macro: 0.4845
29
+ - Recall Micro: 0.5455
30
+ - Recall Weighted: 0.5455
31
+ - Accuracy: 0.5455
32
 
33
  ## Model description
34
 
 
62
 
63
  | Training Loss | Epoch | Step | Validation Loss | F1 Macro | F1 Micro | F1 Weighted | Precision Macro | Precision Micro | Precision Weighted | Recall Macro | Recall Micro | Recall Weighted | Accuracy |
64
  |:-------------:|:-----:|:----:|:---------------:|:--------:|:--------:|:-----------:|:---------------:|:---------------:|:------------------:|:------------:|:------------:|:---------------:|:--------:|
65
+ | 1.9437 | 1.0 | 29 | 1.8987 | 0.1485 | 0.2576 | 0.1680 | 0.1192 | 0.2576 | 0.1321 | 0.2207 | 0.2576 | 0.2576 | 0.2576 |
66
+ | 1.4616 | 2.0 | 58 | 1.5844 | 0.3569 | 0.4242 | 0.4076 | 0.4336 | 0.4242 | 0.4738 | 0.3657 | 0.4242 | 0.4242 | 0.4242 |
67
+ | 1.9935 | 3.0 | 87 | 1.4952 | 0.3059 | 0.4242 | 0.3585 | 0.3795 | 0.4242 | 0.4097 | 0.3387 | 0.4242 | 0.4242 | 0.4242 |
68
+ | 1.3601 | 4.0 | 116 | 1.4319 | 0.3275 | 0.4167 | 0.3720 | 0.3223 | 0.4167 | 0.3618 | 0.3614 | 0.4167 | 0.4167 | 0.4167 |
69
+ | 1.1685 | 5.0 | 145 | 1.1508 | 0.4913 | 0.5833 | 0.5550 | 0.4887 | 0.5833 | 0.5484 | 0.5139 | 0.5833 | 0.5833 | 0.5833 |
70
+ | 1.2228 | 6.0 | 174 | 1.2663 | 0.4865 | 0.5076 | 0.5046 | 0.5339 | 0.5076 | 0.5644 | 0.4964 | 0.5076 | 0.5076 | 0.5076 |
71
+ | 1.2811 | 7.0 | 203 | 1.4596 | 0.4084 | 0.5303 | 0.4752 | 0.5582 | 0.5303 | 0.6068 | 0.4383 | 0.5303 | 0.5303 | 0.5303 |
72
+ | 1.7256 | 8.0 | 232 | 1.4908 | 0.4805 | 0.5682 | 0.5435 | 0.5333 | 0.5682 | 0.6122 | 0.5219 | 0.5682 | 0.5682 | 0.5682 |
73
+ | 0.4549 | 9.0 | 261 | 1.2969 | 0.5270 | 0.6136 | 0.5648 | 0.6664 | 0.6136 | 0.6757 | 0.5526 | 0.6136 | 0.6136 | 0.6136 |
74
+ | 0.5877 | 10.0 | 290 | 1.3581 | 0.4638 | 0.5758 | 0.5271 | 0.5632 | 0.5758 | 0.6293 | 0.5095 | 0.5758 | 0.5758 | 0.5758 |
75
+ | 0.3451 | 11.0 | 319 | 1.2491 | 0.5613 | 0.6136 | 0.6066 | 0.5909 | 0.6136 | 0.6111 | 0.5589 | 0.6136 | 0.6136 | 0.6136 |
76
+ | 0.4885 | 12.0 | 348 | 1.6862 | 0.5381 | 0.6288 | 0.6087 | 0.5515 | 0.6288 | 0.6225 | 0.5576 | 0.6288 | 0.6288 | 0.6288 |
77
+ | 0.3835 | 13.0 | 377 | 1.8354 | 0.5318 | 0.5379 | 0.5440 | 0.6396 | 0.5379 | 0.6577 | 0.5264 | 0.5379 | 0.5379 | 0.5379 |
 
78
 
79
 
80
  ### Framework versions
81
 
82
+ - Transformers 4.48.2
83
+ - Pytorch 2.6.0+cu124
84
  - Datasets 3.2.0
85
  - Tokenizers 0.21.0
all_results.json CHANGED
@@ -1,22 +1,22 @@
1
  {
2
- "epoch": 14.0,
3
- "eval_accuracy": 0.5303030303030303,
4
- "eval_f1_macro": 0.45183408691763977,
5
- "eval_f1_micro": 0.5303030303030303,
6
- "eval_f1_weighted": 0.4968217608209532,
7
- "eval_loss": 1.3036777973175049,
8
- "eval_precision_macro": 0.445124716553288,
9
- "eval_precision_micro": 0.5303030303030303,
10
- "eval_precision_weighted": 0.49716209716209714,
11
- "eval_recall_macro": 0.4879251700680272,
12
- "eval_recall_micro": 0.5303030303030303,
13
- "eval_recall_weighted": 0.5303030303030303,
14
- "eval_runtime": 1.189,
15
- "eval_samples_per_second": 55.508,
16
- "eval_steps_per_second": 7.569,
17
- "total_flos": 5.0124065012763034e+17,
18
- "train_loss": 1.2848335812831748,
19
- "train_runtime": 397.0629,
20
- "train_samples_per_second": 46.542,
21
- "train_steps_per_second": 2.921
22
  }
 
1
  {
2
+ "epoch": 13.0,
3
+ "eval_accuracy": 0.5454545454545454,
4
+ "eval_f1_macro": 0.4568079539508111,
5
+ "eval_f1_micro": 0.5454545454545454,
6
+ "eval_f1_weighted": 0.5111111111111111,
7
+ "eval_loss": 1.2750184535980225,
8
+ "eval_precision_macro": 0.4686224489795919,
9
+ "eval_precision_micro": 0.5454545454545454,
10
+ "eval_precision_weighted": 0.5172619047619048,
11
+ "eval_recall_macro": 0.4845238095238095,
12
+ "eval_recall_micro": 0.5454545454545454,
13
+ "eval_recall_weighted": 0.5454545454545454,
14
+ "eval_runtime": 1.534,
15
+ "eval_samples_per_second": 43.025,
16
+ "eval_steps_per_second": 5.867,
17
+ "total_flos": 1.6452764844550595e+18,
18
+ "train_loss": 1.068754496403651,
19
+ "train_runtime": 506.3845,
20
+ "train_samples_per_second": 36.494,
21
+ "train_steps_per_second": 2.291
22
  }
config.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "_name_or_path": "google/vit-base-patch16-224-in21k",
3
  "architectures": [
4
  "ViTForImageClassification"
5
  ],
@@ -7,7 +7,7 @@
7
  "encoder_stride": 16,
8
  "hidden_act": "gelu",
9
  "hidden_dropout_prob": 0.0,
10
- "hidden_size": 768,
11
  "id2label": {
12
  "0": "-",
13
  "1": "0",
@@ -19,7 +19,7 @@
19
  },
20
  "image_size": 224,
21
  "initializer_range": 0.02,
22
- "intermediate_size": 3072,
23
  "label2id": {
24
  "-": "0",
25
  "0": "1",
@@ -31,12 +31,12 @@
31
  },
32
  "layer_norm_eps": 1e-12,
33
  "model_type": "vit",
34
- "num_attention_heads": 12,
35
  "num_channels": 3,
36
- "num_hidden_layers": 12,
37
  "patch_size": 16,
38
  "problem_type": "single_label_classification",
39
  "qkv_bias": true,
40
  "torch_dtype": "float32",
41
- "transformers_version": "4.48.1"
42
  }
 
1
  {
2
+ "_name_or_path": "google/vit-large-patch16-224",
3
  "architectures": [
4
  "ViTForImageClassification"
5
  ],
 
7
  "encoder_stride": 16,
8
  "hidden_act": "gelu",
9
  "hidden_dropout_prob": 0.0,
10
+ "hidden_size": 1024,
11
  "id2label": {
12
  "0": "-",
13
  "1": "0",
 
19
  },
20
  "image_size": 224,
21
  "initializer_range": 0.02,
22
+ "intermediate_size": 4096,
23
  "label2id": {
24
  "-": "0",
25
  "0": "1",
 
31
  },
32
  "layer_norm_eps": 1e-12,
33
  "model_type": "vit",
34
+ "num_attention_heads": 16,
35
  "num_channels": 3,
36
+ "num_hidden_layers": 24,
37
  "patch_size": 16,
38
  "problem_type": "single_label_classification",
39
  "qkv_bias": true,
40
  "torch_dtype": "float32",
41
+ "transformers_version": "4.48.2"
42
  }
eval_results.json CHANGED
@@ -1,17 +1,17 @@
1
  {
2
- "epoch": 14.0,
3
- "eval_accuracy": 0.5303030303030303,
4
- "eval_f1_macro": 0.45183408691763977,
5
- "eval_f1_micro": 0.5303030303030303,
6
- "eval_f1_weighted": 0.4968217608209532,
7
- "eval_loss": 1.3036777973175049,
8
- "eval_precision_macro": 0.445124716553288,
9
- "eval_precision_micro": 0.5303030303030303,
10
- "eval_precision_weighted": 0.49716209716209714,
11
- "eval_recall_macro": 0.4879251700680272,
12
- "eval_recall_micro": 0.5303030303030303,
13
- "eval_recall_weighted": 0.5303030303030303,
14
- "eval_runtime": 1.189,
15
- "eval_samples_per_second": 55.508,
16
- "eval_steps_per_second": 7.569
17
  }
 
1
  {
2
+ "epoch": 13.0,
3
+ "eval_accuracy": 0.5454545454545454,
4
+ "eval_f1_macro": 0.4568079539508111,
5
+ "eval_f1_micro": 0.5454545454545454,
6
+ "eval_f1_weighted": 0.5111111111111111,
7
+ "eval_loss": 1.2750184535980225,
8
+ "eval_precision_macro": 0.4686224489795919,
9
+ "eval_precision_micro": 0.5454545454545454,
10
+ "eval_precision_weighted": 0.5172619047619048,
11
+ "eval_recall_macro": 0.4845238095238095,
12
+ "eval_recall_micro": 0.5454545454545454,
13
+ "eval_recall_weighted": 0.5454545454545454,
14
+ "eval_runtime": 1.534,
15
+ "eval_samples_per_second": 43.025,
16
+ "eval_steps_per_second": 5.867
17
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fde8587a7577d39c089be75f061c2f32599a2e805e69552b9b6de60b76f901c2
3
- size 343239356
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3f96947220c3e743da2ce54eb712c8f83c029ec3b367dc6b35a503cf00a770d8
3
+ size 1213281772
runs/Feb02_21-20-23_modal/events.out.tfevents.1738531224.modal.2.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:23c5b89b6a4413b367e1a1b4c95870510f2c65fb63e4eaf3fd5c069c21932854
3
+ size 55703
runs/Feb02_21-20-23_modal/events.out.tfevents.1738531224.modal.2.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b07383123646559f20567f54b3e9d4cb0b0ffbb85aca1abcd6dece1d7f1adb9a
3
+ size 55703
runs/Feb02_21-20-23_modal/events.out.tfevents.1738531733.modal.2.2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bf52770e387ed5821b0aa8719f9132acefa69da5000a038b5aa25d00610cbb14
3
+ size 921
runs/Feb02_21-20-23_modal/events.out.tfevents.1738531733.modal.2.3 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ed5ca4a2b0192b10e142b1e24c718c34d1adf9e55ab4ecc5dbdf975d337a4eff
3
+ size 921
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 14.0,
3
- "total_flos": 5.0124065012763034e+17,
4
- "train_loss": 1.2848335812831748,
5
- "train_runtime": 397.0629,
6
- "train_samples_per_second": 46.542,
7
- "train_steps_per_second": 2.921
8
  }
 
1
  {
2
+ "epoch": 13.0,
3
+ "total_flos": 1.6452764844550595e+18,
4
+ "train_loss": 1.068754496403651,
5
+ "train_runtime": 506.3845,
6
+ "train_samples_per_second": 36.494,
7
+ "train_steps_per_second": 2.291
8
  }
trainer_state.json CHANGED
@@ -1,1694 +1,1571 @@
1
  {
2
- "best_metric": 1.1536208391189575,
3
- "best_model_checkpoint": "squarerun_earlystop/checkpoint-319",
4
- "epoch": 14.0,
5
  "eval_steps": 500,
6
- "global_step": 406,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.06896551724137931,
13
- "grad_norm": 2.7594025135040283,
14
  "learning_rate": 1.724137931034483e-06,
15
- "loss": 1.9367,
16
  "step": 2
17
  },
18
  {
19
  "epoch": 0.13793103448275862,
20
- "grad_norm": 1.8562699556350708,
21
  "learning_rate": 3.448275862068966e-06,
22
- "loss": 1.9474,
23
  "step": 4
24
  },
25
  {
26
  "epoch": 0.20689655172413793,
27
- "grad_norm": 2.0168490409851074,
28
  "learning_rate": 5.172413793103448e-06,
29
- "loss": 1.9574,
30
  "step": 6
31
  },
32
  {
33
  "epoch": 0.27586206896551724,
34
- "grad_norm": 1.8773648738861084,
35
  "learning_rate": 6.896551724137932e-06,
36
- "loss": 1.9342,
37
  "step": 8
38
  },
39
  {
40
  "epoch": 0.3448275862068966,
41
- "grad_norm": 2.7246925830841064,
42
  "learning_rate": 8.620689655172414e-06,
43
- "loss": 1.9447,
44
  "step": 10
45
  },
46
  {
47
  "epoch": 0.41379310344827586,
48
- "grad_norm": 1.8781946897506714,
49
  "learning_rate": 1.0344827586206897e-05,
50
- "loss": 1.9496,
51
  "step": 12
52
  },
53
  {
54
  "epoch": 0.4827586206896552,
55
- "grad_norm": 1.8623287677764893,
56
  "learning_rate": 1.206896551724138e-05,
57
- "loss": 1.9467,
58
  "step": 14
59
  },
60
  {
61
  "epoch": 0.5517241379310345,
62
- "grad_norm": 1.9005070924758911,
63
  "learning_rate": 1.3793103448275863e-05,
64
- "loss": 1.9104,
65
  "step": 16
66
  },
67
  {
68
  "epoch": 0.6206896551724138,
69
- "grad_norm": 1.8667243719100952,
70
  "learning_rate": 1.5517241379310346e-05,
71
- "loss": 1.8702,
72
  "step": 18
73
  },
74
  {
75
  "epoch": 0.6896551724137931,
76
- "grad_norm": 2.1068148612976074,
77
  "learning_rate": 1.7241379310344828e-05,
78
- "loss": 1.9145,
79
  "step": 20
80
  },
81
  {
82
  "epoch": 0.7586206896551724,
83
- "grad_norm": 1.2688392400741577,
84
  "learning_rate": 1.896551724137931e-05,
85
- "loss": 1.9089,
86
  "step": 22
87
  },
88
  {
89
  "epoch": 0.8275862068965517,
90
- "grad_norm": 1.5856612920761108,
91
  "learning_rate": 2.0689655172413793e-05,
92
- "loss": 1.915,
93
  "step": 24
94
  },
95
  {
96
  "epoch": 0.896551724137931,
97
- "grad_norm": 2.54569935798645,
98
  "learning_rate": 2.2413793103448276e-05,
99
- "loss": 1.8866,
100
  "step": 26
101
  },
102
  {
103
  "epoch": 0.9655172413793104,
104
- "grad_norm": 1.6762207746505737,
105
  "learning_rate": 2.413793103448276e-05,
106
- "loss": 1.9079,
107
  "step": 28
108
  },
109
  {
110
  "epoch": 1.0,
111
- "eval_accuracy": 0.1893939393939394,
112
- "eval_f1_macro": 0.07908887360942154,
113
- "eval_f1_micro": 0.1893939393939394,
114
- "eval_f1_weighted": 0.10867467716782785,
115
- "eval_loss": 1.8821076154708862,
116
- "eval_precision_macro": 0.05654761904761905,
117
- "eval_precision_micro": 0.1893939393939394,
118
- "eval_precision_weighted": 0.07749368686868686,
119
- "eval_recall_macro": 0.13693121693121693,
120
- "eval_recall_micro": 0.1893939393939394,
121
- "eval_recall_weighted": 0.1893939393939394,
122
- "eval_runtime": 2.3123,
123
- "eval_samples_per_second": 57.086,
124
- "eval_steps_per_second": 7.352,
125
  "step": 29
126
  },
127
  {
128
  "epoch": 1.0344827586206897,
129
- "grad_norm": 2.2096035480499268,
130
  "learning_rate": 2.5862068965517244e-05,
131
- "loss": 1.89,
132
  "step": 30
133
  },
134
  {
135
  "epoch": 1.103448275862069,
136
- "grad_norm": 1.5493906736373901,
137
  "learning_rate": 2.7586206896551727e-05,
138
- "loss": 1.8294,
139
  "step": 32
140
  },
141
  {
142
  "epoch": 1.1724137931034484,
143
- "grad_norm": 1.3173717260360718,
144
  "learning_rate": 2.9310344827586206e-05,
145
- "loss": 1.8368,
146
  "step": 34
147
  },
148
  {
149
  "epoch": 1.2413793103448276,
150
- "grad_norm": 1.1894272565841675,
151
  "learning_rate": 3.103448275862069e-05,
152
- "loss": 1.9087,
153
  "step": 36
154
  },
155
  {
156
  "epoch": 1.3103448275862069,
157
- "grad_norm": 1.8291478157043457,
158
  "learning_rate": 3.275862068965517e-05,
159
- "loss": 1.8554,
160
  "step": 38
161
  },
162
  {
163
  "epoch": 1.3793103448275863,
164
- "grad_norm": 1.853824257850647,
165
  "learning_rate": 3.4482758620689657e-05,
166
- "loss": 1.8932,
167
  "step": 40
168
  },
169
  {
170
  "epoch": 1.4482758620689655,
171
- "grad_norm": 1.5440897941589355,
172
  "learning_rate": 3.620689655172414e-05,
173
- "loss": 1.8864,
174
  "step": 42
175
  },
176
  {
177
  "epoch": 1.5172413793103448,
178
- "grad_norm": 2.2528514862060547,
179
  "learning_rate": 3.793103448275862e-05,
180
- "loss": 1.8693,
181
  "step": 44
182
  },
183
  {
184
  "epoch": 1.5862068965517242,
185
- "grad_norm": 1.5831176042556763,
186
  "learning_rate": 3.965517241379311e-05,
187
- "loss": 1.943,
188
  "step": 46
189
  },
190
  {
191
  "epoch": 1.6551724137931034,
192
- "grad_norm": 1.6178966760635376,
193
  "learning_rate": 4.1379310344827587e-05,
194
- "loss": 1.8708,
195
  "step": 48
196
  },
197
  {
198
  "epoch": 1.7241379310344827,
199
- "grad_norm": 1.557183027267456,
200
  "learning_rate": 4.3103448275862066e-05,
201
- "loss": 1.9144,
202
  "step": 50
203
  },
204
  {
205
  "epoch": 1.793103448275862,
206
- "grad_norm": 1.8258789777755737,
207
  "learning_rate": 4.482758620689655e-05,
208
- "loss": 1.8375,
209
  "step": 52
210
  },
211
  {
212
  "epoch": 1.8620689655172413,
213
- "grad_norm": 2.0916709899902344,
214
  "learning_rate": 4.655172413793104e-05,
215
- "loss": 1.9264,
216
  "step": 54
217
  },
218
  {
219
  "epoch": 1.9310344827586206,
220
- "grad_norm": 2.1662747859954834,
221
  "learning_rate": 4.827586206896552e-05,
222
- "loss": 1.8365,
223
  "step": 56
224
  },
225
  {
226
  "epoch": 2.0,
227
- "grad_norm": 1.7327982187271118,
228
  "learning_rate": 5e-05,
229
- "loss": 1.8597,
230
  "step": 58
231
  },
232
  {
233
  "epoch": 2.0,
234
- "eval_accuracy": 0.2878787878787879,
235
- "eval_f1_macro": 0.1861596677647032,
236
- "eval_f1_micro": 0.2878787878787879,
237
- "eval_f1_weighted": 0.23435453610119228,
238
- "eval_loss": 1.8469468355178833,
239
- "eval_precision_macro": 0.18836650821944942,
240
- "eval_precision_micro": 0.2878787878787879,
241
- "eval_precision_weighted": 0.23453479761969068,
242
- "eval_recall_macro": 0.22805744520030236,
243
- "eval_recall_micro": 0.2878787878787879,
244
- "eval_recall_weighted": 0.2878787878787879,
245
- "eval_runtime": 2.319,
246
- "eval_samples_per_second": 56.921,
247
- "eval_steps_per_second": 7.331,
248
  "step": 58
249
  },
250
  {
251
  "epoch": 2.0689655172413794,
252
- "grad_norm": 2.041114091873169,
253
  "learning_rate": 5.172413793103449e-05,
254
- "loss": 1.8907,
255
  "step": 60
256
  },
257
  {
258
  "epoch": 2.1379310344827585,
259
- "grad_norm": 1.7124186754226685,
260
  "learning_rate": 5.344827586206896e-05,
261
- "loss": 1.8079,
262
  "step": 62
263
  },
264
  {
265
  "epoch": 2.206896551724138,
266
- "grad_norm": 2.1433985233306885,
267
  "learning_rate": 5.517241379310345e-05,
268
- "loss": 1.8424,
269
  "step": 64
270
  },
271
  {
272
  "epoch": 2.2758620689655173,
273
- "grad_norm": 2.639194965362549,
274
  "learning_rate": 5.689655172413794e-05,
275
- "loss": 1.907,
276
  "step": 66
277
  },
278
  {
279
  "epoch": 2.344827586206897,
280
- "grad_norm": 1.93711519241333,
281
  "learning_rate": 5.862068965517241e-05,
282
- "loss": 1.7818,
283
  "step": 68
284
  },
285
  {
286
  "epoch": 2.413793103448276,
287
- "grad_norm": 2.6230249404907227,
288
  "learning_rate": 6.03448275862069e-05,
289
- "loss": 1.8183,
290
  "step": 70
291
  },
292
  {
293
  "epoch": 2.4827586206896552,
294
- "grad_norm": 1.7783054113388062,
295
  "learning_rate": 6.206896551724138e-05,
296
- "loss": 1.8002,
297
  "step": 72
298
  },
299
  {
300
  "epoch": 2.5517241379310347,
301
- "grad_norm": 1.6677814722061157,
302
  "learning_rate": 6.379310344827587e-05,
303
- "loss": 1.788,
304
  "step": 74
305
  },
306
  {
307
  "epoch": 2.6206896551724137,
308
- "grad_norm": 2.2913436889648438,
309
  "learning_rate": 6.551724137931034e-05,
310
- "loss": 1.8614,
311
  "step": 76
312
  },
313
  {
314
  "epoch": 2.689655172413793,
315
- "grad_norm": 2.1580638885498047,
316
  "learning_rate": 6.724137931034483e-05,
317
- "loss": 1.9277,
318
  "step": 78
319
  },
320
  {
321
  "epoch": 2.7586206896551726,
322
- "grad_norm": 2.206195831298828,
323
  "learning_rate": 6.896551724137931e-05,
324
- "loss": 1.8815,
325
  "step": 80
326
  },
327
  {
328
  "epoch": 2.8275862068965516,
329
- "grad_norm": 1.749338150024414,
330
  "learning_rate": 7.06896551724138e-05,
331
- "loss": 1.7088,
332
  "step": 82
333
  },
334
  {
335
  "epoch": 2.896551724137931,
336
- "grad_norm": 1.626955270767212,
337
  "learning_rate": 7.241379310344828e-05,
338
- "loss": 1.8376,
339
  "step": 84
340
  },
341
  {
342
  "epoch": 2.9655172413793105,
343
- "grad_norm": 1.9347732067108154,
344
  "learning_rate": 7.413793103448277e-05,
345
- "loss": 1.9027,
346
  "step": 86
347
  },
348
  {
349
  "epoch": 3.0,
350
- "eval_accuracy": 0.3106060606060606,
351
- "eval_f1_macro": 0.17294549642243576,
352
- "eval_f1_micro": 0.3106060606060606,
353
- "eval_f1_weighted": 0.22206095246104496,
354
- "eval_loss": 1.7611310482025146,
355
- "eval_precision_macro": 0.15255920550038196,
356
- "eval_precision_micro": 0.3106060606060606,
357
- "eval_precision_weighted": 0.1940568789499271,
358
- "eval_recall_macro": 0.23585789871504156,
359
- "eval_recall_micro": 0.3106060606060606,
360
- "eval_recall_weighted": 0.3106060606060606,
361
- "eval_runtime": 2.3347,
362
- "eval_samples_per_second": 56.539,
363
- "eval_steps_per_second": 7.282,
364
  "step": 87
365
  },
366
  {
367
  "epoch": 3.0344827586206895,
368
- "grad_norm": 2.2112436294555664,
369
  "learning_rate": 7.586206896551724e-05,
370
- "loss": 1.8942,
371
  "step": 88
372
  },
373
  {
374
  "epoch": 3.103448275862069,
375
- "grad_norm": 1.7872307300567627,
376
  "learning_rate": 7.758620689655173e-05,
377
- "loss": 1.662,
378
  "step": 90
379
  },
380
  {
381
  "epoch": 3.1724137931034484,
382
- "grad_norm": 1.9869877099990845,
383
  "learning_rate": 7.931034482758621e-05,
384
- "loss": 1.7751,
385
  "step": 92
386
  },
387
  {
388
  "epoch": 3.2413793103448274,
389
- "grad_norm": 2.021848440170288,
390
  "learning_rate": 8.103448275862069e-05,
391
- "loss": 1.7261,
392
  "step": 94
393
  },
394
  {
395
  "epoch": 3.310344827586207,
396
- "grad_norm": 3.4642999172210693,
397
  "learning_rate": 8.275862068965517e-05,
398
- "loss": 1.774,
399
  "step": 96
400
  },
401
  {
402
  "epoch": 3.3793103448275863,
403
- "grad_norm": 2.333994150161743,
404
  "learning_rate": 8.448275862068966e-05,
405
- "loss": 1.7085,
406
  "step": 98
407
  },
408
  {
409
  "epoch": 3.4482758620689653,
410
- "grad_norm": 2.7643978595733643,
411
  "learning_rate": 8.620689655172413e-05,
412
- "loss": 1.6867,
413
  "step": 100
414
  },
415
  {
416
  "epoch": 3.5172413793103448,
417
- "grad_norm": 2.4885361194610596,
418
  "learning_rate": 8.793103448275862e-05,
419
- "loss": 1.8514,
420
  "step": 102
421
  },
422
  {
423
  "epoch": 3.586206896551724,
424
- "grad_norm": 1.7879903316497803,
425
  "learning_rate": 8.96551724137931e-05,
426
- "loss": 1.639,
427
  "step": 104
428
  },
429
  {
430
  "epoch": 3.655172413793103,
431
- "grad_norm": 3.6773548126220703,
432
  "learning_rate": 9.137931034482759e-05,
433
- "loss": 1.7757,
434
  "step": 106
435
  },
436
  {
437
  "epoch": 3.7241379310344827,
438
- "grad_norm": 2.4036428928375244,
439
  "learning_rate": 9.310344827586207e-05,
440
- "loss": 1.6385,
441
  "step": 108
442
  },
443
  {
444
  "epoch": 3.793103448275862,
445
- "grad_norm": 2.5688936710357666,
446
  "learning_rate": 9.482758620689656e-05,
447
- "loss": 1.6298,
448
  "step": 110
449
  },
450
  {
451
  "epoch": 3.862068965517241,
452
- "grad_norm": 3.3116631507873535,
453
  "learning_rate": 9.655172413793105e-05,
454
- "loss": 1.7618,
455
  "step": 112
456
  },
457
  {
458
  "epoch": 3.9310344827586206,
459
- "grad_norm": 2.488884925842285,
460
  "learning_rate": 9.827586206896552e-05,
461
- "loss": 1.567,
462
  "step": 114
463
  },
464
  {
465
  "epoch": 4.0,
466
- "grad_norm": 2.22160005569458,
467
  "learning_rate": 0.0001,
468
- "loss": 1.5401,
469
  "step": 116
470
  },
471
  {
472
  "epoch": 4.0,
473
- "eval_accuracy": 0.3787878787878788,
474
- "eval_f1_macro": 0.2465531802753436,
475
- "eval_f1_micro": 0.3787878787878788,
476
- "eval_f1_weighted": 0.3014117539447724,
477
- "eval_loss": 1.630239486694336,
478
- "eval_precision_macro": 0.2601781537007802,
479
- "eval_precision_micro": 0.3787878787878788,
480
- "eval_precision_weighted": 0.30067037235303745,
481
- "eval_recall_macro": 0.29933484504913077,
482
- "eval_recall_micro": 0.3787878787878788,
483
- "eval_recall_weighted": 0.3787878787878788,
484
- "eval_runtime": 2.2953,
485
- "eval_samples_per_second": 57.508,
486
- "eval_steps_per_second": 7.406,
487
  "step": 116
488
  },
489
  {
490
  "epoch": 4.068965517241379,
491
- "grad_norm": 4.651881694793701,
492
  "learning_rate": 9.980842911877395e-05,
493
- "loss": 1.5738,
494
  "step": 118
495
  },
496
  {
497
  "epoch": 4.137931034482759,
498
- "grad_norm": 2.5916430950164795,
499
  "learning_rate": 9.96168582375479e-05,
500
- "loss": 1.5169,
501
  "step": 120
502
  },
503
  {
504
  "epoch": 4.206896551724138,
505
- "grad_norm": 2.5187177658081055,
506
  "learning_rate": 9.942528735632183e-05,
507
- "loss": 1.5412,
508
  "step": 122
509
  },
510
  {
511
  "epoch": 4.275862068965517,
512
- "grad_norm": 3.8361008167266846,
513
  "learning_rate": 9.92337164750958e-05,
514
- "loss": 1.4326,
515
  "step": 124
516
  },
517
  {
518
  "epoch": 4.344827586206897,
519
- "grad_norm": 2.278944969177246,
520
  "learning_rate": 9.904214559386974e-05,
521
- "loss": 1.5543,
522
  "step": 126
523
  },
524
  {
525
  "epoch": 4.413793103448276,
526
- "grad_norm": 3.5142998695373535,
527
  "learning_rate": 9.885057471264369e-05,
528
- "loss": 1.452,
529
  "step": 128
530
  },
531
  {
532
  "epoch": 4.482758620689655,
533
- "grad_norm": 2.6121275424957275,
534
  "learning_rate": 9.865900383141762e-05,
535
- "loss": 1.7841,
536
  "step": 130
537
  },
538
  {
539
  "epoch": 4.551724137931035,
540
- "grad_norm": 3.0184738636016846,
541
  "learning_rate": 9.846743295019157e-05,
542
- "loss": 1.7388,
543
  "step": 132
544
  },
545
  {
546
  "epoch": 4.620689655172414,
547
- "grad_norm": 2.688973903656006,
548
  "learning_rate": 9.827586206896552e-05,
549
- "loss": 1.6655,
550
  "step": 134
551
  },
552
  {
553
  "epoch": 4.689655172413794,
554
- "grad_norm": 3.4905381202697754,
555
  "learning_rate": 9.808429118773947e-05,
556
- "loss": 1.6848,
557
  "step": 136
558
  },
559
  {
560
  "epoch": 4.758620689655173,
561
- "grad_norm": 4.4559783935546875,
562
  "learning_rate": 9.789272030651341e-05,
563
- "loss": 1.5301,
564
  "step": 138
565
  },
566
  {
567
  "epoch": 4.827586206896552,
568
- "grad_norm": 2.9265379905700684,
569
  "learning_rate": 9.770114942528736e-05,
570
- "loss": 1.5221,
571
  "step": 140
572
  },
573
  {
574
  "epoch": 4.896551724137931,
575
- "grad_norm": 3.7354650497436523,
576
  "learning_rate": 9.750957854406131e-05,
577
- "loss": 1.6706,
578
  "step": 142
579
  },
580
  {
581
  "epoch": 4.9655172413793105,
582
- "grad_norm": 1.8896524906158447,
583
  "learning_rate": 9.731800766283526e-05,
584
- "loss": 1.4,
585
  "step": 144
586
  },
587
  {
588
  "epoch": 5.0,
589
- "eval_accuracy": 0.38636363636363635,
590
- "eval_f1_macro": 0.28971739759733683,
591
- "eval_f1_micro": 0.38636363636363635,
592
- "eval_f1_weighted": 0.3245222936047404,
593
- "eval_loss": 1.5179905891418457,
594
- "eval_precision_macro": 0.40333043562252485,
595
- "eval_precision_micro": 0.38636363636363635,
596
- "eval_precision_weighted": 0.4498278935398611,
597
- "eval_recall_macro": 0.342562358276644,
598
- "eval_recall_micro": 0.38636363636363635,
599
- "eval_recall_weighted": 0.38636363636363635,
600
- "eval_runtime": 2.3124,
601
- "eval_samples_per_second": 57.084,
602
- "eval_steps_per_second": 7.352,
603
  "step": 145
604
  },
605
  {
606
  "epoch": 5.0344827586206895,
607
- "grad_norm": 3.1451351642608643,
608
  "learning_rate": 9.71264367816092e-05,
609
- "loss": 1.7405,
610
  "step": 146
611
  },
612
  {
613
  "epoch": 5.103448275862069,
614
- "grad_norm": 3.222198247909546,
615
  "learning_rate": 9.693486590038314e-05,
616
- "loss": 1.4106,
617
  "step": 148
618
  },
619
  {
620
  "epoch": 5.172413793103448,
621
- "grad_norm": 3.350672721862793,
622
  "learning_rate": 9.674329501915709e-05,
623
- "loss": 1.3386,
624
  "step": 150
625
  },
626
  {
627
  "epoch": 5.241379310344827,
628
- "grad_norm": 3.2664127349853516,
629
  "learning_rate": 9.655172413793105e-05,
630
- "loss": 1.2624,
631
  "step": 152
632
  },
633
  {
634
  "epoch": 5.310344827586207,
635
- "grad_norm": 3.0906858444213867,
636
  "learning_rate": 9.6360153256705e-05,
637
- "loss": 1.4569,
638
  "step": 154
639
  },
640
  {
641
  "epoch": 5.379310344827586,
642
- "grad_norm": 3.0391974449157715,
643
  "learning_rate": 9.616858237547893e-05,
644
- "loss": 1.3857,
645
  "step": 156
646
  },
647
  {
648
  "epoch": 5.448275862068965,
649
- "grad_norm": 4.365240573883057,
650
  "learning_rate": 9.597701149425288e-05,
651
- "loss": 1.4081,
652
  "step": 158
653
  },
654
  {
655
  "epoch": 5.517241379310345,
656
- "grad_norm": 2.9517900943756104,
657
  "learning_rate": 9.578544061302682e-05,
658
- "loss": 1.2797,
659
  "step": 160
660
  },
661
  {
662
  "epoch": 5.586206896551724,
663
- "grad_norm": 4.0065693855285645,
664
  "learning_rate": 9.559386973180077e-05,
665
- "loss": 1.6445,
666
  "step": 162
667
  },
668
  {
669
  "epoch": 5.655172413793103,
670
- "grad_norm": 3.3533987998962402,
671
  "learning_rate": 9.540229885057472e-05,
672
- "loss": 1.4004,
673
  "step": 164
674
  },
675
  {
676
  "epoch": 5.724137931034483,
677
- "grad_norm": 2.4490716457366943,
678
  "learning_rate": 9.521072796934867e-05,
679
- "loss": 1.29,
680
  "step": 166
681
  },
682
  {
683
  "epoch": 5.793103448275862,
684
- "grad_norm": 3.6202354431152344,
685
  "learning_rate": 9.501915708812261e-05,
686
- "loss": 1.5721,
687
  "step": 168
688
  },
689
  {
690
  "epoch": 5.862068965517241,
691
- "grad_norm": 3.9141111373901367,
692
  "learning_rate": 9.482758620689656e-05,
693
- "loss": 1.3968,
694
  "step": 170
695
  },
696
  {
697
  "epoch": 5.931034482758621,
698
- "grad_norm": 3.4564762115478516,
699
  "learning_rate": 9.463601532567051e-05,
700
- "loss": 1.7216,
701
  "step": 172
702
  },
703
  {
704
  "epoch": 6.0,
705
- "grad_norm": 3.0670645236968994,
706
  "learning_rate": 9.444444444444444e-05,
707
- "loss": 1.6258,
708
  "step": 174
709
  },
710
  {
711
  "epoch": 6.0,
712
- "eval_accuracy": 0.45454545454545453,
713
- "eval_f1_macro": 0.33417309051028693,
714
- "eval_f1_micro": 0.45454545454545453,
715
- "eval_f1_weighted": 0.39252059608411566,
716
- "eval_loss": 1.4914804697036743,
717
- "eval_precision_macro": 0.4129126120204563,
718
- "eval_precision_micro": 0.45454545454545453,
719
- "eval_precision_weighted": 0.44543409915562315,
720
- "eval_recall_macro": 0.3678155706727135,
721
- "eval_recall_micro": 0.45454545454545453,
722
- "eval_recall_weighted": 0.45454545454545453,
723
- "eval_runtime": 2.3235,
724
- "eval_samples_per_second": 56.81,
725
- "eval_steps_per_second": 7.316,
726
  "step": 174
727
  },
728
  {
729
  "epoch": 6.068965517241379,
730
- "grad_norm": 4.329052925109863,
731
  "learning_rate": 9.425287356321839e-05,
732
- "loss": 1.2878,
733
  "step": 176
734
  },
735
  {
736
  "epoch": 6.137931034482759,
737
- "grad_norm": 3.1268393993377686,
738
  "learning_rate": 9.406130268199235e-05,
739
- "loss": 1.3256,
740
  "step": 178
741
  },
742
  {
743
  "epoch": 6.206896551724138,
744
- "grad_norm": 3.9881491661071777,
745
  "learning_rate": 9.38697318007663e-05,
746
- "loss": 1.3112,
747
  "step": 180
748
  },
749
  {
750
  "epoch": 6.275862068965517,
751
- "grad_norm": 2.776026964187622,
752
  "learning_rate": 9.367816091954023e-05,
753
- "loss": 1.466,
754
  "step": 182
755
  },
756
  {
757
  "epoch": 6.344827586206897,
758
- "grad_norm": 4.8603949546813965,
759
  "learning_rate": 9.348659003831418e-05,
760
- "loss": 1.4044,
761
  "step": 184
762
  },
763
  {
764
  "epoch": 6.413793103448276,
765
- "grad_norm": 2.983745813369751,
766
  "learning_rate": 9.329501915708813e-05,
767
- "loss": 1.1204,
768
  "step": 186
769
  },
770
  {
771
  "epoch": 6.482758620689655,
772
- "grad_norm": 2.8195648193359375,
773
  "learning_rate": 9.310344827586207e-05,
774
- "loss": 1.2529,
775
  "step": 188
776
  },
777
  {
778
  "epoch": 6.551724137931035,
779
- "grad_norm": 2.048353910446167,
780
  "learning_rate": 9.291187739463601e-05,
781
- "loss": 1.2915,
782
  "step": 190
783
  },
784
  {
785
  "epoch": 6.620689655172414,
786
- "grad_norm": 3.2794029712677,
787
  "learning_rate": 9.272030651340997e-05,
788
- "loss": 1.1726,
789
  "step": 192
790
  },
791
  {
792
  "epoch": 6.689655172413794,
793
- "grad_norm": 2.7745888233184814,
794
  "learning_rate": 9.252873563218392e-05,
795
- "loss": 1.2873,
796
  "step": 194
797
  },
798
  {
799
  "epoch": 6.758620689655173,
800
- "grad_norm": 3.1752567291259766,
801
  "learning_rate": 9.233716475095786e-05,
802
- "loss": 1.4572,
803
  "step": 196
804
  },
805
  {
806
  "epoch": 6.827586206896552,
807
- "grad_norm": 2.956439971923828,
808
  "learning_rate": 9.21455938697318e-05,
809
- "loss": 1.15,
810
  "step": 198
811
  },
812
  {
813
  "epoch": 6.896551724137931,
814
- "grad_norm": 4.599544048309326,
815
  "learning_rate": 9.195402298850575e-05,
816
- "loss": 1.0882,
817
  "step": 200
818
  },
819
  {
820
  "epoch": 6.9655172413793105,
821
- "grad_norm": 2.18859601020813,
822
  "learning_rate": 9.17624521072797e-05,
823
- "loss": 1.3579,
824
  "step": 202
825
  },
826
  {
827
  "epoch": 7.0,
828
  "eval_accuracy": 0.5303030303030303,
829
- "eval_f1_macro": 0.4270098934408117,
830
  "eval_f1_micro": 0.5303030303030303,
831
- "eval_f1_weighted": 0.4917944808690925,
832
- "eval_loss": 1.3220996856689453,
833
- "eval_precision_macro": 0.4622853094839513,
834
  "eval_precision_micro": 0.5303030303030303,
835
- "eval_precision_weighted": 0.5269497784546403,
836
- "eval_recall_macro": 0.4613756613756614,
837
  "eval_recall_micro": 0.5303030303030303,
838
  "eval_recall_weighted": 0.5303030303030303,
839
- "eval_runtime": 2.3356,
840
- "eval_samples_per_second": 56.518,
841
- "eval_steps_per_second": 7.279,
842
  "step": 203
843
  },
844
  {
845
  "epoch": 7.0344827586206895,
846
- "grad_norm": 3.5771939754486084,
847
  "learning_rate": 9.157088122605364e-05,
848
- "loss": 1.1677,
849
  "step": 204
850
  },
851
  {
852
  "epoch": 7.103448275862069,
853
- "grad_norm": 4.361037254333496,
854
  "learning_rate": 9.137931034482759e-05,
855
- "loss": 1.1697,
856
  "step": 206
857
  },
858
  {
859
  "epoch": 7.172413793103448,
860
- "grad_norm": 3.966294527053833,
861
  "learning_rate": 9.118773946360154e-05,
862
- "loss": 1.4308,
863
  "step": 208
864
  },
865
  {
866
  "epoch": 7.241379310344827,
867
- "grad_norm": 3.7766764163970947,
868
  "learning_rate": 9.099616858237548e-05,
869
- "loss": 1.2745,
870
  "step": 210
871
  },
872
  {
873
  "epoch": 7.310344827586207,
874
- "grad_norm": 3.12256121635437,
875
  "learning_rate": 9.080459770114943e-05,
876
- "loss": 1.246,
877
  "step": 212
878
  },
879
  {
880
  "epoch": 7.379310344827586,
881
- "grad_norm": 6.705962657928467,
882
  "learning_rate": 9.061302681992338e-05,
883
- "loss": 1.0487,
884
  "step": 214
885
  },
886
  {
887
  "epoch": 7.448275862068965,
888
- "grad_norm": 3.5392026901245117,
889
  "learning_rate": 9.042145593869731e-05,
890
- "loss": 1.3188,
891
  "step": 216
892
  },
893
  {
894
  "epoch": 7.517241379310345,
895
- "grad_norm": 3.6603386402130127,
896
  "learning_rate": 9.022988505747126e-05,
897
- "loss": 1.1814,
898
  "step": 218
899
  },
900
  {
901
  "epoch": 7.586206896551724,
902
- "grad_norm": 3.4975929260253906,
903
  "learning_rate": 9.003831417624522e-05,
904
- "loss": 1.1921,
905
  "step": 220
906
  },
907
  {
908
  "epoch": 7.655172413793103,
909
- "grad_norm": 2.4388949871063232,
910
  "learning_rate": 8.984674329501917e-05,
911
- "loss": 1.1733,
912
  "step": 222
913
  },
914
  {
915
  "epoch": 7.724137931034483,
916
- "grad_norm": 4.060675621032715,
917
  "learning_rate": 8.96551724137931e-05,
918
- "loss": 1.1269,
919
  "step": 224
920
  },
921
  {
922
  "epoch": 7.793103448275862,
923
- "grad_norm": 4.740964412689209,
924
  "learning_rate": 8.946360153256705e-05,
925
- "loss": 1.312,
926
  "step": 226
927
  },
928
  {
929
  "epoch": 7.862068965517241,
930
- "grad_norm": 2.808018922805786,
931
  "learning_rate": 8.9272030651341e-05,
932
- "loss": 1.0676,
933
  "step": 228
934
  },
935
  {
936
  "epoch": 7.931034482758621,
937
- "grad_norm": 3.1474294662475586,
938
  "learning_rate": 8.908045977011495e-05,
939
- "loss": 1.3769,
940
  "step": 230
941
  },
942
  {
943
  "epoch": 8.0,
944
- "grad_norm": 3.1077136993408203,
945
  "learning_rate": 8.888888888888889e-05,
946
- "loss": 1.236,
947
  "step": 232
948
  },
949
  {
950
  "epoch": 8.0,
951
- "eval_accuracy": 0.5303030303030303,
952
- "eval_f1_macro": 0.4404058178034977,
953
- "eval_f1_micro": 0.5303030303030303,
954
- "eval_f1_weighted": 0.5073458619011424,
955
- "eval_loss": 1.3025015592575073,
956
- "eval_precision_macro": 0.4672230220942427,
957
- "eval_precision_micro": 0.5303030303030303,
958
- "eval_precision_weighted": 0.5369953051160421,
959
- "eval_recall_macro": 0.4579365079365079,
960
- "eval_recall_micro": 0.5303030303030303,
961
- "eval_recall_weighted": 0.5303030303030303,
962
- "eval_runtime": 2.3164,
963
- "eval_samples_per_second": 56.984,
964
- "eval_steps_per_second": 7.339,
965
  "step": 232
966
  },
967
  {
968
  "epoch": 8.068965517241379,
969
- "grad_norm": 4.443036079406738,
970
  "learning_rate": 8.869731800766284e-05,
971
- "loss": 1.0253,
972
  "step": 234
973
  },
974
  {
975
  "epoch": 8.137931034482758,
976
- "grad_norm": 4.066115856170654,
977
  "learning_rate": 8.850574712643679e-05,
978
- "loss": 1.0771,
979
  "step": 236
980
  },
981
  {
982
  "epoch": 8.206896551724139,
983
- "grad_norm": 4.220799446105957,
984
  "learning_rate": 8.831417624521074e-05,
985
- "loss": 1.1984,
986
  "step": 238
987
  },
988
  {
989
  "epoch": 8.275862068965518,
990
- "grad_norm": 2.096390962600708,
991
  "learning_rate": 8.812260536398468e-05,
992
- "loss": 1.0571,
993
  "step": 240
994
  },
995
  {
996
  "epoch": 8.344827586206897,
997
- "grad_norm": 2.7211785316467285,
998
  "learning_rate": 8.793103448275862e-05,
999
- "loss": 1.0065,
1000
  "step": 242
1001
  },
1002
  {
1003
  "epoch": 8.413793103448276,
1004
- "grad_norm": 3.908750534057617,
1005
  "learning_rate": 8.773946360153256e-05,
1006
- "loss": 1.146,
1007
  "step": 244
1008
  },
1009
  {
1010
  "epoch": 8.482758620689655,
1011
- "grad_norm": 3.5071165561676025,
1012
  "learning_rate": 8.754789272030651e-05,
1013
- "loss": 1.079,
1014
  "step": 246
1015
  },
1016
  {
1017
  "epoch": 8.551724137931034,
1018
- "grad_norm": 2.588198184967041,
1019
  "learning_rate": 8.735632183908047e-05,
1020
- "loss": 1.0779,
1021
  "step": 248
1022
  },
1023
  {
1024
  "epoch": 8.620689655172415,
1025
- "grad_norm": 2.8031132221221924,
1026
  "learning_rate": 8.716475095785441e-05,
1027
- "loss": 1.2069,
1028
  "step": 250
1029
  },
1030
  {
1031
  "epoch": 8.689655172413794,
1032
- "grad_norm": 3.2936129570007324,
1033
  "learning_rate": 8.697318007662835e-05,
1034
- "loss": 1.0423,
1035
  "step": 252
1036
  },
1037
  {
1038
  "epoch": 8.758620689655173,
1039
- "grad_norm": 3.772505760192871,
1040
  "learning_rate": 8.67816091954023e-05,
1041
- "loss": 1.2322,
1042
  "step": 254
1043
  },
1044
  {
1045
  "epoch": 8.827586206896552,
1046
- "grad_norm": 5.190494060516357,
1047
  "learning_rate": 8.659003831417625e-05,
1048
- "loss": 1.1618,
1049
  "step": 256
1050
  },
1051
  {
1052
  "epoch": 8.89655172413793,
1053
- "grad_norm": 3.416548728942871,
1054
  "learning_rate": 8.63984674329502e-05,
1055
- "loss": 1.2072,
1056
  "step": 258
1057
  },
1058
  {
1059
  "epoch": 8.96551724137931,
1060
- "grad_norm": 3.05737566947937,
1061
  "learning_rate": 8.620689655172413e-05,
1062
- "loss": 0.7554,
1063
  "step": 260
1064
  },
1065
  {
1066
  "epoch": 9.0,
1067
- "eval_accuracy": 0.5227272727272727,
1068
- "eval_f1_macro": 0.4409491354837298,
1069
- "eval_f1_micro": 0.5227272727272727,
1070
- "eval_f1_weighted": 0.5085484244400937,
1071
- "eval_loss": 1.2686671018600464,
1072
- "eval_precision_macro": 0.4555427282335717,
1073
- "eval_precision_micro": 0.5227272727272727,
1074
- "eval_precision_weighted": 0.5195899195431687,
1075
- "eval_recall_macro": 0.45465608465608465,
1076
- "eval_recall_micro": 0.5227272727272727,
1077
- "eval_recall_weighted": 0.5227272727272727,
1078
- "eval_runtime": 2.311,
1079
- "eval_samples_per_second": 57.118,
1080
- "eval_steps_per_second": 7.356,
1081
  "step": 261
1082
  },
1083
  {
1084
  "epoch": 9.03448275862069,
1085
- "grad_norm": 3.317532539367676,
1086
  "learning_rate": 8.601532567049809e-05,
1087
- "loss": 1.0184,
1088
  "step": 262
1089
  },
1090
  {
1091
  "epoch": 9.10344827586207,
1092
- "grad_norm": 2.7105941772460938,
1093
  "learning_rate": 8.582375478927204e-05,
1094
- "loss": 1.001,
1095
  "step": 264
1096
  },
1097
  {
1098
  "epoch": 9.172413793103448,
1099
- "grad_norm": 4.116084575653076,
1100
  "learning_rate": 8.563218390804599e-05,
1101
- "loss": 1.1204,
1102
  "step": 266
1103
  },
1104
  {
1105
  "epoch": 9.241379310344827,
1106
- "grad_norm": 2.512586832046509,
1107
  "learning_rate": 8.544061302681992e-05,
1108
- "loss": 0.784,
1109
  "step": 268
1110
  },
1111
  {
1112
  "epoch": 9.310344827586206,
1113
- "grad_norm": 3.6794638633728027,
1114
  "learning_rate": 8.524904214559387e-05,
1115
- "loss": 0.9,
1116
  "step": 270
1117
  },
1118
  {
1119
  "epoch": 9.379310344827585,
1120
- "grad_norm": 4.9960832595825195,
1121
  "learning_rate": 8.505747126436782e-05,
1122
- "loss": 0.7855,
1123
  "step": 272
1124
  },
1125
  {
1126
  "epoch": 9.448275862068966,
1127
- "grad_norm": 5.506075382232666,
1128
  "learning_rate": 8.486590038314178e-05,
1129
- "loss": 0.9476,
1130
  "step": 274
1131
  },
1132
  {
1133
  "epoch": 9.517241379310345,
1134
- "grad_norm": 4.194758415222168,
1135
  "learning_rate": 8.467432950191571e-05,
1136
- "loss": 0.7085,
1137
  "step": 276
1138
  },
1139
  {
1140
  "epoch": 9.586206896551724,
1141
- "grad_norm": 1.5031163692474365,
1142
  "learning_rate": 8.448275862068966e-05,
1143
- "loss": 0.8127,
1144
  "step": 278
1145
  },
1146
  {
1147
  "epoch": 9.655172413793103,
1148
- "grad_norm": 1.31179678440094,
1149
  "learning_rate": 8.42911877394636e-05,
1150
- "loss": 0.564,
1151
  "step": 280
1152
  },
1153
  {
1154
  "epoch": 9.724137931034482,
1155
- "grad_norm": 3.820150852203369,
1156
  "learning_rate": 8.409961685823755e-05,
1157
- "loss": 1.018,
1158
  "step": 282
1159
  },
1160
  {
1161
  "epoch": 9.793103448275861,
1162
- "grad_norm": 4.5845465660095215,
1163
  "learning_rate": 8.39080459770115e-05,
1164
- "loss": 1.2035,
1165
  "step": 284
1166
  },
1167
  {
1168
  "epoch": 9.862068965517242,
1169
- "grad_norm": 6.150566577911377,
1170
  "learning_rate": 8.371647509578544e-05,
1171
- "loss": 1.0823,
1172
  "step": 286
1173
  },
1174
  {
1175
  "epoch": 9.931034482758621,
1176
- "grad_norm": 4.594326496124268,
1177
  "learning_rate": 8.35249042145594e-05,
1178
- "loss": 0.9801,
1179
  "step": 288
1180
  },
1181
  {
1182
  "epoch": 10.0,
1183
- "grad_norm": 4.550546646118164,
1184
  "learning_rate": 8.333333333333334e-05,
1185
- "loss": 1.2034,
1186
  "step": 290
1187
  },
1188
  {
1189
  "epoch": 10.0,
1190
- "eval_accuracy": 0.5151515151515151,
1191
- "eval_f1_macro": 0.42231551060990824,
1192
- "eval_f1_micro": 0.5151515151515151,
1193
- "eval_f1_weighted": 0.48980373792849813,
1194
- "eval_loss": 1.3682000637054443,
1195
- "eval_precision_macro": 0.4735347621488278,
1196
- "eval_precision_micro": 0.5151515151515151,
1197
- "eval_precision_weighted": 0.5570637716417561,
1198
- "eval_recall_macro": 0.44784580498866217,
1199
- "eval_recall_micro": 0.5151515151515151,
1200
- "eval_recall_weighted": 0.5151515151515151,
1201
- "eval_runtime": 2.3231,
1202
- "eval_samples_per_second": 56.821,
1203
- "eval_steps_per_second": 7.318,
1204
  "step": 290
1205
  },
1206
  {
1207
  "epoch": 10.068965517241379,
1208
- "grad_norm": 5.808166027069092,
1209
  "learning_rate": 8.314176245210729e-05,
1210
- "loss": 1.1703,
1211
  "step": 292
1212
  },
1213
  {
1214
  "epoch": 10.137931034482758,
1215
- "grad_norm": 2.5559470653533936,
1216
  "learning_rate": 8.295019157088123e-05,
1217
- "loss": 0.8235,
1218
  "step": 294
1219
  },
1220
  {
1221
  "epoch": 10.206896551724139,
1222
- "grad_norm": 4.7338762283325195,
1223
  "learning_rate": 8.275862068965517e-05,
1224
- "loss": 0.9399,
1225
  "step": 296
1226
  },
1227
  {
1228
  "epoch": 10.275862068965518,
1229
- "grad_norm": 4.487349987030029,
1230
  "learning_rate": 8.256704980842912e-05,
1231
- "loss": 0.7128,
1232
  "step": 298
1233
  },
1234
  {
1235
  "epoch": 10.344827586206897,
1236
- "grad_norm": 3.328216791152954,
1237
  "learning_rate": 8.237547892720307e-05,
1238
- "loss": 1.0484,
1239
  "step": 300
1240
  },
1241
  {
1242
  "epoch": 10.413793103448276,
1243
- "grad_norm": 2.7354414463043213,
1244
  "learning_rate": 8.218390804597702e-05,
1245
- "loss": 0.73,
1246
  "step": 302
1247
  },
1248
  {
1249
  "epoch": 10.482758620689655,
1250
- "grad_norm": 4.714895248413086,
1251
  "learning_rate": 8.199233716475096e-05,
1252
- "loss": 0.8512,
1253
  "step": 304
1254
  },
1255
  {
1256
  "epoch": 10.551724137931034,
1257
- "grad_norm": 4.107720851898193,
1258
  "learning_rate": 8.180076628352491e-05,
1259
- "loss": 1.033,
1260
  "step": 306
1261
  },
1262
  {
1263
  "epoch": 10.620689655172415,
1264
- "grad_norm": 2.4637210369110107,
1265
  "learning_rate": 8.160919540229886e-05,
1266
- "loss": 0.8492,
1267
  "step": 308
1268
  },
1269
  {
1270
  "epoch": 10.689655172413794,
1271
- "grad_norm": 4.778433799743652,
1272
  "learning_rate": 8.14176245210728e-05,
1273
- "loss": 1.1118,
1274
  "step": 310
1275
  },
1276
  {
1277
  "epoch": 10.758620689655173,
1278
- "grad_norm": 5.476203918457031,
1279
  "learning_rate": 8.122605363984674e-05,
1280
- "loss": 1.1105,
1281
  "step": 312
1282
  },
1283
  {
1284
  "epoch": 10.827586206896552,
1285
- "grad_norm": 4.607093811035156,
1286
  "learning_rate": 8.103448275862069e-05,
1287
- "loss": 0.7392,
1288
  "step": 314
1289
  },
1290
  {
1291
  "epoch": 10.89655172413793,
1292
- "grad_norm": 5.401411056518555,
1293
  "learning_rate": 8.084291187739465e-05,
1294
- "loss": 0.5906,
1295
  "step": 316
1296
  },
1297
  {
1298
  "epoch": 10.96551724137931,
1299
- "grad_norm": 4.555649757385254,
1300
  "learning_rate": 8.06513409961686e-05,
1301
- "loss": 0.8037,
1302
  "step": 318
1303
  },
1304
  {
1305
  "epoch": 11.0,
1306
- "eval_accuracy": 0.6212121212121212,
1307
- "eval_f1_macro": 0.5363613475812564,
1308
- "eval_f1_micro": 0.6212121212121212,
1309
- "eval_f1_weighted": 0.60671912281113,
1310
- "eval_loss": 1.1536208391189575,
1311
- "eval_precision_macro": 0.5305411174976392,
1312
- "eval_precision_micro": 0.6212121212121212,
1313
- "eval_precision_weighted": 0.6092663740885084,
1314
- "eval_recall_macro": 0.555774754346183,
1315
- "eval_recall_micro": 0.6212121212121212,
1316
- "eval_recall_weighted": 0.6212121212121212,
1317
- "eval_runtime": 2.3177,
1318
- "eval_samples_per_second": 56.952,
1319
- "eval_steps_per_second": 7.335,
1320
  "step": 319
1321
  },
1322
  {
1323
  "epoch": 11.03448275862069,
1324
- "grad_norm": 7.6678290367126465,
1325
  "learning_rate": 8.045977011494253e-05,
1326
- "loss": 0.7917,
1327
  "step": 320
1328
  },
1329
  {
1330
  "epoch": 11.10344827586207,
1331
- "grad_norm": 1.9151082038879395,
1332
  "learning_rate": 8.026819923371648e-05,
1333
- "loss": 0.7249,
1334
  "step": 322
1335
  },
1336
  {
1337
  "epoch": 11.172413793103448,
1338
- "grad_norm": 2.772256374359131,
1339
  "learning_rate": 8.007662835249042e-05,
1340
- "loss": 0.7255,
1341
  "step": 324
1342
  },
1343
  {
1344
  "epoch": 11.241379310344827,
1345
- "grad_norm": 2.4357872009277344,
1346
  "learning_rate": 7.988505747126437e-05,
1347
- "loss": 0.5105,
1348
  "step": 326
1349
  },
1350
  {
1351
  "epoch": 11.310344827586206,
1352
- "grad_norm": 4.2995195388793945,
1353
  "learning_rate": 7.969348659003832e-05,
1354
- "loss": 0.8066,
1355
  "step": 328
1356
  },
1357
  {
1358
  "epoch": 11.379310344827585,
1359
- "grad_norm": 2.9026498794555664,
1360
  "learning_rate": 7.950191570881227e-05,
1361
- "loss": 0.9017,
1362
  "step": 330
1363
  },
1364
  {
1365
  "epoch": 11.448275862068966,
1366
- "grad_norm": 4.70675802230835,
1367
  "learning_rate": 7.931034482758621e-05,
1368
- "loss": 1.0516,
1369
  "step": 332
1370
  },
1371
  {
1372
  "epoch": 11.517241379310345,
1373
- "grad_norm": 5.430378437042236,
1374
  "learning_rate": 7.911877394636016e-05,
1375
- "loss": 0.5592,
1376
  "step": 334
1377
  },
1378
  {
1379
  "epoch": 11.586206896551724,
1380
- "grad_norm": 4.292328357696533,
1381
  "learning_rate": 7.892720306513411e-05,
1382
- "loss": 0.691,
1383
  "step": 336
1384
  },
1385
  {
1386
  "epoch": 11.655172413793103,
1387
- "grad_norm": 3.9568679332733154,
1388
  "learning_rate": 7.873563218390804e-05,
1389
- "loss": 0.6625,
1390
  "step": 338
1391
  },
1392
  {
1393
  "epoch": 11.724137931034482,
1394
- "grad_norm": 3.103039503097534,
1395
  "learning_rate": 7.854406130268199e-05,
1396
- "loss": 0.7581,
1397
  "step": 340
1398
  },
1399
  {
1400
  "epoch": 11.793103448275861,
1401
- "grad_norm": 5.845880508422852,
1402
  "learning_rate": 7.835249042145594e-05,
1403
- "loss": 0.7748,
1404
  "step": 342
1405
  },
1406
  {
1407
  "epoch": 11.862068965517242,
1408
- "grad_norm": 3.2300820350646973,
1409
  "learning_rate": 7.81609195402299e-05,
1410
- "loss": 0.6572,
1411
  "step": 344
1412
  },
1413
  {
1414
  "epoch": 11.931034482758621,
1415
- "grad_norm": 5.324902534484863,
1416
  "learning_rate": 7.796934865900383e-05,
1417
- "loss": 0.8584,
1418
  "step": 346
1419
  },
1420
  {
1421
  "epoch": 12.0,
1422
- "grad_norm": 4.982322692871094,
1423
  "learning_rate": 7.777777777777778e-05,
1424
- "loss": 0.593,
1425
  "step": 348
1426
  },
1427
  {
1428
  "epoch": 12.0,
1429
- "eval_accuracy": 0.49242424242424243,
1430
- "eval_f1_macro": 0.44853272706810665,
1431
- "eval_f1_micro": 0.49242424242424243,
1432
- "eval_f1_weighted": 0.5033254246484239,
1433
- "eval_loss": 1.4808580875396729,
1434
- "eval_precision_macro": 0.5443728685107995,
1435
- "eval_precision_micro": 0.49242424242424243,
1436
- "eval_precision_weighted": 0.6404539426091151,
1437
- "eval_recall_macro": 0.47089191232048366,
1438
- "eval_recall_micro": 0.49242424242424243,
1439
- "eval_recall_weighted": 0.49242424242424243,
1440
- "eval_runtime": 2.3168,
1441
- "eval_samples_per_second": 56.975,
1442
- "eval_steps_per_second": 7.338,
1443
  "step": 348
1444
  },
1445
  {
1446
  "epoch": 12.068965517241379,
1447
- "grad_norm": 5.828127384185791,
1448
  "learning_rate": 7.758620689655173e-05,
1449
- "loss": 1.1155,
1450
  "step": 350
1451
  },
1452
  {
1453
  "epoch": 12.137931034482758,
1454
- "grad_norm": 5.41758394241333,
1455
  "learning_rate": 7.739463601532568e-05,
1456
- "loss": 0.8559,
1457
  "step": 352
1458
  },
1459
  {
1460
  "epoch": 12.206896551724139,
1461
- "grad_norm": 4.14806604385376,
1462
  "learning_rate": 7.720306513409961e-05,
1463
- "loss": 0.989,
1464
  "step": 354
1465
  },
1466
  {
1467
  "epoch": 12.275862068965518,
1468
- "grad_norm": 2.6904520988464355,
1469
  "learning_rate": 7.701149425287356e-05,
1470
- "loss": 0.539,
1471
  "step": 356
1472
  },
1473
  {
1474
  "epoch": 12.344827586206897,
1475
- "grad_norm": 1.7748990058898926,
1476
  "learning_rate": 7.681992337164752e-05,
1477
- "loss": 0.6038,
1478
  "step": 358
1479
  },
1480
  {
1481
  "epoch": 12.413793103448276,
1482
- "grad_norm": 2.0579607486724854,
1483
  "learning_rate": 7.662835249042147e-05,
1484
- "loss": 0.7753,
1485
  "step": 360
1486
  },
1487
  {
1488
  "epoch": 12.482758620689655,
1489
- "grad_norm": 2.399552345275879,
1490
  "learning_rate": 7.64367816091954e-05,
1491
- "loss": 0.5238,
1492
  "step": 362
1493
  },
1494
  {
1495
  "epoch": 12.551724137931034,
1496
- "grad_norm": 3.3866896629333496,
1497
  "learning_rate": 7.624521072796935e-05,
1498
- "loss": 0.614,
1499
  "step": 364
1500
  },
1501
  {
1502
  "epoch": 12.620689655172415,
1503
- "grad_norm": 8.123571395874023,
1504
  "learning_rate": 7.60536398467433e-05,
1505
- "loss": 0.6223,
1506
  "step": 366
1507
  },
1508
  {
1509
  "epoch": 12.689655172413794,
1510
- "grad_norm": 7.095142841339111,
1511
  "learning_rate": 7.586206896551724e-05,
1512
- "loss": 0.7668,
1513
  "step": 368
1514
  },
1515
  {
1516
  "epoch": 12.758620689655173,
1517
- "grad_norm": 3.270301103591919,
1518
  "learning_rate": 7.567049808429119e-05,
1519
- "loss": 1.1208,
1520
  "step": 370
1521
  },
1522
  {
1523
  "epoch": 12.827586206896552,
1524
- "grad_norm": 2.6989779472351074,
1525
  "learning_rate": 7.547892720306514e-05,
1526
- "loss": 0.5682,
1527
  "step": 372
1528
  },
1529
  {
1530
  "epoch": 12.89655172413793,
1531
- "grad_norm": 5.336432456970215,
1532
  "learning_rate": 7.528735632183909e-05,
1533
- "loss": 0.6172,
1534
  "step": 374
1535
  },
1536
  {
1537
  "epoch": 12.96551724137931,
1538
- "grad_norm": 6.097706317901611,
1539
  "learning_rate": 7.509578544061303e-05,
1540
- "loss": 0.732,
1541
  "step": 376
1542
  },
1543
  {
1544
  "epoch": 13.0,
1545
- "eval_accuracy": 0.5909090909090909,
1546
- "eval_f1_macro": 0.5144928051331992,
1547
- "eval_f1_micro": 0.5909090909090909,
1548
- "eval_f1_weighted": 0.5808570855592798,
1549
- "eval_loss": 1.1917542219161987,
1550
- "eval_precision_macro": 0.5641553452176186,
1551
- "eval_precision_micro": 0.5909090909090909,
1552
- "eval_precision_weighted": 0.6448181659246691,
1553
- "eval_recall_macro": 0.5287603930461073,
1554
- "eval_recall_micro": 0.5909090909090909,
1555
- "eval_recall_weighted": 0.5909090909090909,
1556
- "eval_runtime": 2.3205,
1557
- "eval_samples_per_second": 56.883,
1558
- "eval_steps_per_second": 7.326,
1559
  "step": 377
1560
  },
1561
  {
1562
- "epoch": 13.03448275862069,
1563
- "grad_norm": 4.893684387207031,
1564
- "learning_rate": 7.490421455938698e-05,
1565
- "loss": 0.7254,
1566
- "step": 378
1567
- },
1568
- {
1569
- "epoch": 13.10344827586207,
1570
- "grad_norm": 3.347688674926758,
1571
- "learning_rate": 7.471264367816091e-05,
1572
- "loss": 0.6572,
1573
- "step": 380
1574
- },
1575
- {
1576
- "epoch": 13.172413793103448,
1577
- "grad_norm": 3.368439197540283,
1578
- "learning_rate": 7.452107279693486e-05,
1579
- "loss": 0.7905,
1580
- "step": 382
1581
- },
1582
- {
1583
- "epoch": 13.241379310344827,
1584
- "grad_norm": 3.221909761428833,
1585
- "learning_rate": 7.432950191570882e-05,
1586
- "loss": 0.748,
1587
- "step": 384
1588
- },
1589
- {
1590
- "epoch": 13.310344827586206,
1591
- "grad_norm": 2.3360860347747803,
1592
- "learning_rate": 7.413793103448277e-05,
1593
- "loss": 0.2969,
1594
- "step": 386
1595
- },
1596
- {
1597
- "epoch": 13.379310344827585,
1598
- "grad_norm": 2.6463725566864014,
1599
- "learning_rate": 7.39463601532567e-05,
1600
- "loss": 0.5682,
1601
- "step": 388
1602
- },
1603
- {
1604
- "epoch": 13.448275862068966,
1605
- "grad_norm": 1.696847677230835,
1606
- "learning_rate": 7.375478927203065e-05,
1607
- "loss": 0.7344,
1608
- "step": 390
1609
- },
1610
- {
1611
- "epoch": 13.517241379310345,
1612
- "grad_norm": 2.6377010345458984,
1613
- "learning_rate": 7.35632183908046e-05,
1614
- "loss": 0.4448,
1615
- "step": 392
1616
- },
1617
- {
1618
- "epoch": 13.586206896551724,
1619
- "grad_norm": 11.842819213867188,
1620
- "learning_rate": 7.337164750957855e-05,
1621
- "loss": 0.6828,
1622
- "step": 394
1623
- },
1624
- {
1625
- "epoch": 13.655172413793103,
1626
- "grad_norm": 4.7342915534973145,
1627
- "learning_rate": 7.31800766283525e-05,
1628
- "loss": 0.8378,
1629
- "step": 396
1630
- },
1631
- {
1632
- "epoch": 13.724137931034482,
1633
- "grad_norm": 4.786142349243164,
1634
- "learning_rate": 7.298850574712644e-05,
1635
- "loss": 0.7508,
1636
- "step": 398
1637
- },
1638
- {
1639
- "epoch": 13.793103448275861,
1640
- "grad_norm": 7.012732028961182,
1641
- "learning_rate": 7.279693486590039e-05,
1642
- "loss": 0.875,
1643
- "step": 400
1644
- },
1645
- {
1646
- "epoch": 13.862068965517242,
1647
- "grad_norm": 7.100485801696777,
1648
- "learning_rate": 7.260536398467434e-05,
1649
- "loss": 0.8155,
1650
- "step": 402
1651
- },
1652
- {
1653
- "epoch": 13.931034482758621,
1654
- "grad_norm": 6.699406623840332,
1655
- "learning_rate": 7.241379310344828e-05,
1656
- "loss": 0.8326,
1657
- "step": 404
1658
- },
1659
- {
1660
- "epoch": 14.0,
1661
- "grad_norm": 3.8491220474243164,
1662
- "learning_rate": 7.222222222222222e-05,
1663
- "loss": 0.8642,
1664
- "step": 406
1665
- },
1666
- {
1667
- "epoch": 14.0,
1668
- "eval_accuracy": 0.5984848484848485,
1669
- "eval_f1_macro": 0.5113535072443615,
1670
- "eval_f1_micro": 0.5984848484848485,
1671
- "eval_f1_weighted": 0.5883891667163884,
1672
- "eval_loss": 1.1884046792984009,
1673
- "eval_precision_macro": 0.5284200509472081,
1674
- "eval_precision_micro": 0.5984848484848485,
1675
- "eval_precision_weighted": 0.610834482351699,
1676
- "eval_recall_macro": 0.5243537414965986,
1677
- "eval_recall_micro": 0.5984848484848485,
1678
- "eval_recall_weighted": 0.5984848484848485,
1679
- "eval_runtime": 2.3348,
1680
- "eval_samples_per_second": 56.536,
1681
- "eval_steps_per_second": 7.281,
1682
- "step": 406
1683
- },
1684
- {
1685
- "epoch": 14.0,
1686
- "step": 406,
1687
- "total_flos": 5.0124065012763034e+17,
1688
- "train_loss": 1.2848335812831748,
1689
- "train_runtime": 397.0629,
1690
- "train_samples_per_second": 46.542,
1691
- "train_steps_per_second": 2.921
1692
  }
1693
  ],
1694
  "logging_steps": 2,
@@ -1699,11 +1576,11 @@
1699
  "stateful_callbacks": {
1700
  "EarlyStoppingCallback": {
1701
  "args": {
1702
- "early_stopping_patience": 3,
1703
  "early_stopping_threshold": 0.0
1704
  },
1705
  "attributes": {
1706
- "early_stopping_patience_counter": 3
1707
  }
1708
  },
1709
  "TrainerControl": {
@@ -1717,7 +1594,7 @@
1717
  "attributes": {}
1718
  }
1719
  },
1720
- "total_flos": 5.0124065012763034e+17,
1721
  "train_batch_size": 8,
1722
  "trial_name": null,
1723
  "trial_params": null
 
1
  {
2
+ "best_metric": 1.150753378868103,
3
+ "best_model_checkpoint": "squarerun_earlystop/checkpoint-145",
4
+ "epoch": 13.0,
5
  "eval_steps": 500,
6
+ "global_step": 377,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.06896551724137931,
13
+ "grad_norm": 19.357681274414062,
14
  "learning_rate": 1.724137931034483e-06,
15
+ "loss": 1.9617,
16
  "step": 2
17
  },
18
  {
19
  "epoch": 0.13793103448275862,
20
+ "grad_norm": 10.07545280456543,
21
  "learning_rate": 3.448275862068966e-06,
22
+ "loss": 2.0829,
23
  "step": 4
24
  },
25
  {
26
  "epoch": 0.20689655172413793,
27
+ "grad_norm": 11.062702178955078,
28
  "learning_rate": 5.172413793103448e-06,
29
+ "loss": 1.948,
30
  "step": 6
31
  },
32
  {
33
  "epoch": 0.27586206896551724,
34
+ "grad_norm": 7.888413429260254,
35
  "learning_rate": 6.896551724137932e-06,
36
+ "loss": 1.9238,
37
  "step": 8
38
  },
39
  {
40
  "epoch": 0.3448275862068966,
41
+ "grad_norm": 12.095210075378418,
42
  "learning_rate": 8.620689655172414e-06,
43
+ "loss": 1.8641,
44
  "step": 10
45
  },
46
  {
47
  "epoch": 0.41379310344827586,
48
+ "grad_norm": 12.549093246459961,
49
  "learning_rate": 1.0344827586206897e-05,
50
+ "loss": 2.1869,
51
  "step": 12
52
  },
53
  {
54
  "epoch": 0.4827586206896552,
55
+ "grad_norm": 10.326892852783203,
56
  "learning_rate": 1.206896551724138e-05,
57
+ "loss": 2.0025,
58
  "step": 14
59
  },
60
  {
61
  "epoch": 0.5517241379310345,
62
+ "grad_norm": 8.114046096801758,
63
  "learning_rate": 1.3793103448275863e-05,
64
+ "loss": 2.0149,
65
  "step": 16
66
  },
67
  {
68
  "epoch": 0.6206896551724138,
69
+ "grad_norm": 9.516056060791016,
70
  "learning_rate": 1.5517241379310346e-05,
71
+ "loss": 1.8154,
72
  "step": 18
73
  },
74
  {
75
  "epoch": 0.6896551724137931,
76
+ "grad_norm": 8.580568313598633,
77
  "learning_rate": 1.7241379310344828e-05,
78
+ "loss": 1.8289,
79
  "step": 20
80
  },
81
  {
82
  "epoch": 0.7586206896551724,
83
+ "grad_norm": 6.422548770904541,
84
  "learning_rate": 1.896551724137931e-05,
85
+ "loss": 1.9361,
86
  "step": 22
87
  },
88
  {
89
  "epoch": 0.8275862068965517,
90
+ "grad_norm": 5.681264400482178,
91
  "learning_rate": 2.0689655172413793e-05,
92
+ "loss": 1.8934,
93
  "step": 24
94
  },
95
  {
96
  "epoch": 0.896551724137931,
97
+ "grad_norm": 11.326376914978027,
98
  "learning_rate": 2.2413793103448276e-05,
99
+ "loss": 1.8371,
100
  "step": 26
101
  },
102
  {
103
  "epoch": 0.9655172413793104,
104
+ "grad_norm": 7.500190258026123,
105
  "learning_rate": 2.413793103448276e-05,
106
+ "loss": 1.9437,
107
  "step": 28
108
  },
109
  {
110
  "epoch": 1.0,
111
+ "eval_accuracy": 0.25757575757575757,
112
+ "eval_f1_macro": 0.14848660111818004,
113
+ "eval_f1_micro": 0.25757575757575757,
114
+ "eval_f1_weighted": 0.16799165746534167,
115
+ "eval_loss": 1.898682713508606,
116
+ "eval_precision_macro": 0.11923314780457638,
117
+ "eval_precision_micro": 0.25757575757575757,
118
+ "eval_precision_weighted": 0.13213301849665487,
119
+ "eval_recall_macro": 0.2206802721088435,
120
+ "eval_recall_micro": 0.25757575757575757,
121
+ "eval_recall_weighted": 0.25757575757575757,
122
+ "eval_runtime": 2.9681,
123
+ "eval_samples_per_second": 44.473,
124
+ "eval_steps_per_second": 5.728,
125
  "step": 29
126
  },
127
  {
128
  "epoch": 1.0344827586206897,
129
+ "grad_norm": 12.50450611114502,
130
  "learning_rate": 2.5862068965517244e-05,
131
+ "loss": 1.9287,
132
  "step": 30
133
  },
134
  {
135
  "epoch": 1.103448275862069,
136
+ "grad_norm": 7.673898696899414,
137
  "learning_rate": 2.7586206896551727e-05,
138
+ "loss": 1.664,
139
  "step": 32
140
  },
141
  {
142
  "epoch": 1.1724137931034484,
143
+ "grad_norm": 5.811491966247559,
144
  "learning_rate": 2.9310344827586206e-05,
145
+ "loss": 1.8535,
146
  "step": 34
147
  },
148
  {
149
  "epoch": 1.2413793103448276,
150
+ "grad_norm": 8.861312866210938,
151
  "learning_rate": 3.103448275862069e-05,
152
+ "loss": 1.9855,
153
  "step": 36
154
  },
155
  {
156
  "epoch": 1.3103448275862069,
157
+ "grad_norm": 8.68062973022461,
158
  "learning_rate": 3.275862068965517e-05,
159
+ "loss": 1.8194,
160
  "step": 38
161
  },
162
  {
163
  "epoch": 1.3793103448275863,
164
+ "grad_norm": 10.850172996520996,
165
  "learning_rate": 3.4482758620689657e-05,
166
+ "loss": 1.8562,
167
  "step": 40
168
  },
169
  {
170
  "epoch": 1.4482758620689655,
171
+ "grad_norm": 7.273862361907959,
172
  "learning_rate": 3.620689655172414e-05,
173
+ "loss": 1.887,
174
  "step": 42
175
  },
176
  {
177
  "epoch": 1.5172413793103448,
178
+ "grad_norm": 9.375494956970215,
179
  "learning_rate": 3.793103448275862e-05,
180
+ "loss": 1.6063,
181
  "step": 44
182
  },
183
  {
184
  "epoch": 1.5862068965517242,
185
+ "grad_norm": 7.466917991638184,
186
  "learning_rate": 3.965517241379311e-05,
187
+ "loss": 1.7104,
188
  "step": 46
189
  },
190
  {
191
  "epoch": 1.6551724137931034,
192
+ "grad_norm": 6.921249866485596,
193
  "learning_rate": 4.1379310344827587e-05,
194
+ "loss": 1.7771,
195
  "step": 48
196
  },
197
  {
198
  "epoch": 1.7241379310344827,
199
+ "grad_norm": 8.85164737701416,
200
  "learning_rate": 4.3103448275862066e-05,
201
+ "loss": 1.7949,
202
  "step": 50
203
  },
204
  {
205
  "epoch": 1.793103448275862,
206
+ "grad_norm": 9.841817855834961,
207
  "learning_rate": 4.482758620689655e-05,
208
+ "loss": 1.7742,
209
  "step": 52
210
  },
211
  {
212
  "epoch": 1.8620689655172413,
213
+ "grad_norm": 9.186838150024414,
214
  "learning_rate": 4.655172413793104e-05,
215
+ "loss": 1.6552,
216
  "step": 54
217
  },
218
  {
219
  "epoch": 1.9310344827586206,
220
+ "grad_norm": 12.096390724182129,
221
  "learning_rate": 4.827586206896552e-05,
222
+ "loss": 1.7408,
223
  "step": 56
224
  },
225
  {
226
  "epoch": 2.0,
227
+ "grad_norm": 7.921807289123535,
228
  "learning_rate": 5e-05,
229
+ "loss": 1.4616,
230
  "step": 58
231
  },
232
  {
233
  "epoch": 2.0,
234
+ "eval_accuracy": 0.42424242424242425,
235
+ "eval_f1_macro": 0.35690013239708984,
236
+ "eval_f1_micro": 0.42424242424242425,
237
+ "eval_f1_weighted": 0.40756753098630294,
238
+ "eval_loss": 1.5844290256500244,
239
+ "eval_precision_macro": 0.4336450032302567,
240
+ "eval_precision_micro": 0.42424242424242425,
241
+ "eval_precision_weighted": 0.473761575667734,
242
+ "eval_recall_macro": 0.3657142857142857,
243
+ "eval_recall_micro": 0.42424242424242425,
244
+ "eval_recall_weighted": 0.42424242424242425,
245
+ "eval_runtime": 2.96,
246
+ "eval_samples_per_second": 44.594,
247
+ "eval_steps_per_second": 5.743,
248
  "step": 58
249
  },
250
  {
251
  "epoch": 2.0689655172413794,
252
+ "grad_norm": 11.485949516296387,
253
  "learning_rate": 5.172413793103449e-05,
254
+ "loss": 1.6275,
255
  "step": 60
256
  },
257
  {
258
  "epoch": 2.1379310344827585,
259
+ "grad_norm": 6.314526081085205,
260
  "learning_rate": 5.344827586206896e-05,
261
+ "loss": 1.396,
262
  "step": 62
263
  },
264
  {
265
  "epoch": 2.206896551724138,
266
+ "grad_norm": 11.086990356445312,
267
  "learning_rate": 5.517241379310345e-05,
268
+ "loss": 1.5553,
269
  "step": 64
270
  },
271
  {
272
  "epoch": 2.2758620689655173,
273
+ "grad_norm": 16.221271514892578,
274
  "learning_rate": 5.689655172413794e-05,
275
+ "loss": 1.6025,
276
  "step": 66
277
  },
278
  {
279
  "epoch": 2.344827586206897,
280
+ "grad_norm": 8.68588924407959,
281
  "learning_rate": 5.862068965517241e-05,
282
+ "loss": 1.435,
283
  "step": 68
284
  },
285
  {
286
  "epoch": 2.413793103448276,
287
+ "grad_norm": 10.737080574035645,
288
  "learning_rate": 6.03448275862069e-05,
289
+ "loss": 1.4304,
290
  "step": 70
291
  },
292
  {
293
  "epoch": 2.4827586206896552,
294
+ "grad_norm": 9.171552658081055,
295
  "learning_rate": 6.206896551724138e-05,
296
+ "loss": 1.5959,
297
  "step": 72
298
  },
299
  {
300
  "epoch": 2.5517241379310347,
301
+ "grad_norm": 10.185091972351074,
302
  "learning_rate": 6.379310344827587e-05,
303
+ "loss": 1.2897,
304
  "step": 74
305
  },
306
  {
307
  "epoch": 2.6206896551724137,
308
+ "grad_norm": 10.52213191986084,
309
  "learning_rate": 6.551724137931034e-05,
310
+ "loss": 1.5851,
311
  "step": 76
312
  },
313
  {
314
  "epoch": 2.689655172413793,
315
+ "grad_norm": 6.222963333129883,
316
  "learning_rate": 6.724137931034483e-05,
317
+ "loss": 1.8139,
318
  "step": 78
319
  },
320
  {
321
  "epoch": 2.7586206896551726,
322
+ "grad_norm": 9.174198150634766,
323
  "learning_rate": 6.896551724137931e-05,
324
+ "loss": 1.4277,
325
  "step": 80
326
  },
327
  {
328
  "epoch": 2.8275862068965516,
329
+ "grad_norm": 11.63707160949707,
330
  "learning_rate": 7.06896551724138e-05,
331
+ "loss": 1.4775,
332
  "step": 82
333
  },
334
  {
335
  "epoch": 2.896551724137931,
336
+ "grad_norm": 9.97467041015625,
337
  "learning_rate": 7.241379310344828e-05,
338
+ "loss": 1.8332,
339
  "step": 84
340
  },
341
  {
342
  "epoch": 2.9655172413793105,
343
+ "grad_norm": 11.130326271057129,
344
  "learning_rate": 7.413793103448277e-05,
345
+ "loss": 1.9935,
346
  "step": 86
347
  },
348
  {
349
  "epoch": 3.0,
350
+ "eval_accuracy": 0.42424242424242425,
351
+ "eval_f1_macro": 0.3059313903206654,
352
+ "eval_f1_micro": 0.42424242424242425,
353
+ "eval_f1_weighted": 0.358521912270671,
354
+ "eval_loss": 1.49520742893219,
355
+ "eval_precision_macro": 0.3794595137633113,
356
+ "eval_precision_micro": 0.42424242424242425,
357
+ "eval_precision_weighted": 0.40968066743383197,
358
+ "eval_recall_macro": 0.33868480725623584,
359
+ "eval_recall_micro": 0.42424242424242425,
360
+ "eval_recall_weighted": 0.42424242424242425,
361
+ "eval_runtime": 2.963,
362
+ "eval_samples_per_second": 44.549,
363
+ "eval_steps_per_second": 5.737,
364
  "step": 87
365
  },
366
  {
367
  "epoch": 3.0344827586206895,
368
+ "grad_norm": 9.817953109741211,
369
  "learning_rate": 7.586206896551724e-05,
370
+ "loss": 1.4279,
371
  "step": 88
372
  },
373
  {
374
  "epoch": 3.103448275862069,
375
+ "grad_norm": 6.149507999420166,
376
  "learning_rate": 7.758620689655173e-05,
377
+ "loss": 1.2738,
378
  "step": 90
379
  },
380
  {
381
  "epoch": 3.1724137931034484,
382
+ "grad_norm": 8.348538398742676,
383
  "learning_rate": 7.931034482758621e-05,
384
+ "loss": 1.464,
385
  "step": 92
386
  },
387
  {
388
  "epoch": 3.2413793103448274,
389
+ "grad_norm": 9.717902183532715,
390
  "learning_rate": 8.103448275862069e-05,
391
+ "loss": 1.4473,
392
  "step": 94
393
  },
394
  {
395
  "epoch": 3.310344827586207,
396
+ "grad_norm": 8.710390090942383,
397
  "learning_rate": 8.275862068965517e-05,
398
+ "loss": 1.1216,
399
  "step": 96
400
  },
401
  {
402
  "epoch": 3.3793103448275863,
403
+ "grad_norm": 9.217482566833496,
404
  "learning_rate": 8.448275862068966e-05,
405
+ "loss": 1.3658,
406
  "step": 98
407
  },
408
  {
409
  "epoch": 3.4482758620689653,
410
+ "grad_norm": 7.890769004821777,
411
  "learning_rate": 8.620689655172413e-05,
412
+ "loss": 1.3025,
413
  "step": 100
414
  },
415
  {
416
  "epoch": 3.5172413793103448,
417
+ "grad_norm": 7.533488750457764,
418
  "learning_rate": 8.793103448275862e-05,
419
+ "loss": 1.6855,
420
  "step": 102
421
  },
422
  {
423
  "epoch": 3.586206896551724,
424
+ "grad_norm": 9.90977954864502,
425
  "learning_rate": 8.96551724137931e-05,
426
+ "loss": 1.36,
427
  "step": 104
428
  },
429
  {
430
  "epoch": 3.655172413793103,
431
+ "grad_norm": 7.196977138519287,
432
  "learning_rate": 9.137931034482759e-05,
433
+ "loss": 1.2334,
434
  "step": 106
435
  },
436
  {
437
  "epoch": 3.7241379310344827,
438
+ "grad_norm": 15.348875999450684,
439
  "learning_rate": 9.310344827586207e-05,
440
+ "loss": 1.3841,
441
  "step": 108
442
  },
443
  {
444
  "epoch": 3.793103448275862,
445
+ "grad_norm": 14.928166389465332,
446
  "learning_rate": 9.482758620689656e-05,
447
+ "loss": 2.0163,
448
  "step": 110
449
  },
450
  {
451
  "epoch": 3.862068965517241,
452
+ "grad_norm": 13.652559280395508,
453
  "learning_rate": 9.655172413793105e-05,
454
+ "loss": 1.6106,
455
  "step": 112
456
  },
457
  {
458
  "epoch": 3.9310344827586206,
459
+ "grad_norm": 10.011820793151855,
460
  "learning_rate": 9.827586206896552e-05,
461
+ "loss": 1.6056,
462
  "step": 114
463
  },
464
  {
465
  "epoch": 4.0,
466
+ "grad_norm": 9.423259735107422,
467
  "learning_rate": 0.0001,
468
+ "loss": 1.3601,
469
  "step": 116
470
  },
471
  {
472
  "epoch": 4.0,
473
+ "eval_accuracy": 0.4166666666666667,
474
+ "eval_f1_macro": 0.3275201621665789,
475
+ "eval_f1_micro": 0.4166666666666667,
476
+ "eval_f1_weighted": 0.37198349822934534,
477
+ "eval_loss": 1.4318833351135254,
478
+ "eval_precision_macro": 0.3223270440251572,
479
+ "eval_precision_micro": 0.4166666666666667,
480
+ "eval_precision_weighted": 0.3618431311398721,
481
+ "eval_recall_macro": 0.361360544217687,
482
+ "eval_recall_micro": 0.4166666666666667,
483
+ "eval_recall_weighted": 0.4166666666666667,
484
+ "eval_runtime": 2.9668,
485
+ "eval_samples_per_second": 44.492,
486
+ "eval_steps_per_second": 5.73,
487
  "step": 116
488
  },
489
  {
490
  "epoch": 4.068965517241379,
491
+ "grad_norm": 12.455545425415039,
492
  "learning_rate": 9.980842911877395e-05,
493
+ "loss": 1.4989,
494
  "step": 118
495
  },
496
  {
497
  "epoch": 4.137931034482759,
498
+ "grad_norm": 7.556772232055664,
499
  "learning_rate": 9.96168582375479e-05,
500
+ "loss": 1.1474,
501
  "step": 120
502
  },
503
  {
504
  "epoch": 4.206896551724138,
505
+ "grad_norm": 7.164824962615967,
506
  "learning_rate": 9.942528735632183e-05,
507
+ "loss": 1.3469,
508
  "step": 122
509
  },
510
  {
511
  "epoch": 4.275862068965517,
512
+ "grad_norm": 12.2306489944458,
513
  "learning_rate": 9.92337164750958e-05,
514
+ "loss": 1.2394,
515
  "step": 124
516
  },
517
  {
518
  "epoch": 4.344827586206897,
519
+ "grad_norm": 10.257525444030762,
520
  "learning_rate": 9.904214559386974e-05,
521
+ "loss": 1.2865,
522
  "step": 126
523
  },
524
  {
525
  "epoch": 4.413793103448276,
526
+ "grad_norm": 8.052595138549805,
527
  "learning_rate": 9.885057471264369e-05,
528
+ "loss": 1.5634,
529
  "step": 128
530
  },
531
  {
532
  "epoch": 4.482758620689655,
533
+ "grad_norm": 10.05372142791748,
534
  "learning_rate": 9.865900383141762e-05,
535
+ "loss": 1.3826,
536
  "step": 130
537
  },
538
  {
539
  "epoch": 4.551724137931035,
540
+ "grad_norm": 8.02147102355957,
541
  "learning_rate": 9.846743295019157e-05,
542
+ "loss": 1.492,
543
  "step": 132
544
  },
545
  {
546
  "epoch": 4.620689655172414,
547
+ "grad_norm": 7.700930118560791,
548
  "learning_rate": 9.827586206896552e-05,
549
+ "loss": 1.4822,
550
  "step": 134
551
  },
552
  {
553
  "epoch": 4.689655172413794,
554
+ "grad_norm": 12.909700393676758,
555
  "learning_rate": 9.808429118773947e-05,
556
+ "loss": 1.617,
557
  "step": 136
558
  },
559
  {
560
  "epoch": 4.758620689655173,
561
+ "grad_norm": 6.239136219024658,
562
  "learning_rate": 9.789272030651341e-05,
563
+ "loss": 1.3277,
564
  "step": 138
565
  },
566
  {
567
  "epoch": 4.827586206896552,
568
+ "grad_norm": 3.8295445442199707,
569
  "learning_rate": 9.770114942528736e-05,
570
+ "loss": 0.8903,
571
  "step": 140
572
  },
573
  {
574
  "epoch": 4.896551724137931,
575
+ "grad_norm": 5.378627777099609,
576
  "learning_rate": 9.750957854406131e-05,
577
+ "loss": 1.1828,
578
  "step": 142
579
  },
580
  {
581
  "epoch": 4.9655172413793105,
582
+ "grad_norm": 7.193126201629639,
583
  "learning_rate": 9.731800766283526e-05,
584
+ "loss": 1.1685,
585
  "step": 144
586
  },
587
  {
588
  "epoch": 5.0,
589
+ "eval_accuracy": 0.5833333333333334,
590
+ "eval_f1_macro": 0.49134435355199774,
591
+ "eval_f1_micro": 0.5833333333333334,
592
+ "eval_f1_weighted": 0.5549812296530736,
593
+ "eval_loss": 1.150753378868103,
594
+ "eval_precision_macro": 0.4886814574314574,
595
+ "eval_precision_micro": 0.5833333333333334,
596
+ "eval_precision_weighted": 0.5484221954392409,
597
+ "eval_recall_macro": 0.5139380196523053,
598
+ "eval_recall_micro": 0.5833333333333334,
599
+ "eval_recall_weighted": 0.5833333333333334,
600
+ "eval_runtime": 2.9609,
601
+ "eval_samples_per_second": 44.58,
602
+ "eval_steps_per_second": 5.741,
603
  "step": 145
604
  },
605
  {
606
  "epoch": 5.0344827586206895,
607
+ "grad_norm": 8.835988998413086,
608
  "learning_rate": 9.71264367816092e-05,
609
+ "loss": 1.4503,
610
  "step": 146
611
  },
612
  {
613
  "epoch": 5.103448275862069,
614
+ "grad_norm": 11.152828216552734,
615
  "learning_rate": 9.693486590038314e-05,
616
+ "loss": 1.1148,
617
  "step": 148
618
  },
619
  {
620
  "epoch": 5.172413793103448,
621
+ "grad_norm": 5.95009183883667,
622
  "learning_rate": 9.674329501915709e-05,
623
+ "loss": 0.7798,
624
  "step": 150
625
  },
626
  {
627
  "epoch": 5.241379310344827,
628
+ "grad_norm": 9.28747272491455,
629
  "learning_rate": 9.655172413793105e-05,
630
+ "loss": 1.071,
631
  "step": 152
632
  },
633
  {
634
  "epoch": 5.310344827586207,
635
+ "grad_norm": 9.649368286132812,
636
  "learning_rate": 9.6360153256705e-05,
637
+ "loss": 1.129,
638
  "step": 154
639
  },
640
  {
641
  "epoch": 5.379310344827586,
642
+ "grad_norm": 6.515026569366455,
643
  "learning_rate": 9.616858237547893e-05,
644
+ "loss": 1.256,
645
  "step": 156
646
  },
647
  {
648
  "epoch": 5.448275862068965,
649
+ "grad_norm": 12.087512969970703,
650
  "learning_rate": 9.597701149425288e-05,
651
+ "loss": 1.8989,
652
  "step": 158
653
  },
654
  {
655
  "epoch": 5.517241379310345,
656
+ "grad_norm": 7.530760288238525,
657
  "learning_rate": 9.578544061302682e-05,
658
+ "loss": 0.9788,
659
  "step": 160
660
  },
661
  {
662
  "epoch": 5.586206896551724,
663
+ "grad_norm": 7.923486709594727,
664
  "learning_rate": 9.559386973180077e-05,
665
+ "loss": 1.2471,
666
  "step": 162
667
  },
668
  {
669
  "epoch": 5.655172413793103,
670
+ "grad_norm": 4.997186183929443,
671
  "learning_rate": 9.540229885057472e-05,
672
+ "loss": 1.1195,
673
  "step": 164
674
  },
675
  {
676
  "epoch": 5.724137931034483,
677
+ "grad_norm": 8.351778030395508,
678
  "learning_rate": 9.521072796934867e-05,
679
+ "loss": 1.2892,
680
  "step": 166
681
  },
682
  {
683
  "epoch": 5.793103448275862,
684
+ "grad_norm": 6.890372276306152,
685
  "learning_rate": 9.501915708812261e-05,
686
+ "loss": 1.1941,
687
  "step": 168
688
  },
689
  {
690
  "epoch": 5.862068965517241,
691
+ "grad_norm": 10.095701217651367,
692
  "learning_rate": 9.482758620689656e-05,
693
+ "loss": 1.1857,
694
  "step": 170
695
  },
696
  {
697
  "epoch": 5.931034482758621,
698
+ "grad_norm": 6.678011417388916,
699
  "learning_rate": 9.463601532567051e-05,
700
+ "loss": 1.1152,
701
  "step": 172
702
  },
703
  {
704
  "epoch": 6.0,
705
+ "grad_norm": 5.188634395599365,
706
  "learning_rate": 9.444444444444444e-05,
707
+ "loss": 1.2228,
708
  "step": 174
709
  },
710
  {
711
  "epoch": 6.0,
712
+ "eval_accuracy": 0.5075757575757576,
713
+ "eval_f1_macro": 0.48649389792977665,
714
+ "eval_f1_micro": 0.5075757575757576,
715
+ "eval_f1_weighted": 0.504583670632264,
716
+ "eval_loss": 1.2663319110870361,
717
+ "eval_precision_macro": 0.5338627386438447,
718
+ "eval_precision_micro": 0.5075757575757576,
719
+ "eval_precision_weighted": 0.5644310326770005,
720
+ "eval_recall_macro": 0.49635676492819353,
721
+ "eval_recall_micro": 0.5075757575757576,
722
+ "eval_recall_weighted": 0.5075757575757576,
723
+ "eval_runtime": 2.9553,
724
+ "eval_samples_per_second": 44.666,
725
+ "eval_steps_per_second": 5.752,
726
  "step": 174
727
  },
728
  {
729
  "epoch": 6.068965517241379,
730
+ "grad_norm": 8.04442024230957,
731
  "learning_rate": 9.425287356321839e-05,
732
+ "loss": 0.813,
733
  "step": 176
734
  },
735
  {
736
  "epoch": 6.137931034482759,
737
+ "grad_norm": 4.239109516143799,
738
  "learning_rate": 9.406130268199235e-05,
739
+ "loss": 0.7334,
740
  "step": 178
741
  },
742
  {
743
  "epoch": 6.206896551724138,
744
+ "grad_norm": 11.466559410095215,
745
  "learning_rate": 9.38697318007663e-05,
746
+ "loss": 0.9938,
747
  "step": 180
748
  },
749
  {
750
  "epoch": 6.275862068965517,
751
+ "grad_norm": 6.6499199867248535,
752
  "learning_rate": 9.367816091954023e-05,
753
+ "loss": 1.4748,
754
  "step": 182
755
  },
756
  {
757
  "epoch": 6.344827586206897,
758
+ "grad_norm": 6.700629234313965,
759
  "learning_rate": 9.348659003831418e-05,
760
+ "loss": 0.7446,
761
  "step": 184
762
  },
763
  {
764
  "epoch": 6.413793103448276,
765
+ "grad_norm": 5.955834865570068,
766
  "learning_rate": 9.329501915708813e-05,
767
+ "loss": 0.5876,
768
  "step": 186
769
  },
770
  {
771
  "epoch": 6.482758620689655,
772
+ "grad_norm": 4.36069393157959,
773
  "learning_rate": 9.310344827586207e-05,
774
+ "loss": 0.6486,
775
  "step": 188
776
  },
777
  {
778
  "epoch": 6.551724137931035,
779
+ "grad_norm": 6.722736358642578,
780
  "learning_rate": 9.291187739463601e-05,
781
+ "loss": 0.9317,
782
  "step": 190
783
  },
784
  {
785
  "epoch": 6.620689655172414,
786
+ "grad_norm": 10.725391387939453,
787
  "learning_rate": 9.272030651340997e-05,
788
+ "loss": 1.0865,
789
  "step": 192
790
  },
791
  {
792
  "epoch": 6.689655172413794,
793
+ "grad_norm": 7.119555950164795,
794
  "learning_rate": 9.252873563218392e-05,
795
+ "loss": 1.0571,
796
  "step": 194
797
  },
798
  {
799
  "epoch": 6.758620689655173,
800
+ "grad_norm": 7.10548210144043,
801
  "learning_rate": 9.233716475095786e-05,
802
+ "loss": 1.2002,
803
  "step": 196
804
  },
805
  {
806
  "epoch": 6.827586206896552,
807
+ "grad_norm": 7.357244968414307,
808
  "learning_rate": 9.21455938697318e-05,
809
+ "loss": 0.9461,
810
  "step": 198
811
  },
812
  {
813
  "epoch": 6.896551724137931,
814
+ "grad_norm": 9.458577156066895,
815
  "learning_rate": 9.195402298850575e-05,
816
+ "loss": 1.0334,
817
  "step": 200
818
  },
819
  {
820
  "epoch": 6.9655172413793105,
821
+ "grad_norm": 7.48079252243042,
822
  "learning_rate": 9.17624521072797e-05,
823
+ "loss": 1.2811,
824
  "step": 202
825
  },
826
  {
827
  "epoch": 7.0,
828
  "eval_accuracy": 0.5303030303030303,
829
+ "eval_f1_macro": 0.40842438013755056,
830
  "eval_f1_micro": 0.5303030303030303,
831
+ "eval_f1_weighted": 0.4751581109098944,
832
+ "eval_loss": 1.4595577716827393,
833
+ "eval_precision_macro": 0.5581683833407971,
834
  "eval_precision_micro": 0.5303030303030303,
835
+ "eval_precision_weighted": 0.6067953833000855,
836
+ "eval_recall_macro": 0.43830687830687826,
837
  "eval_recall_micro": 0.5303030303030303,
838
  "eval_recall_weighted": 0.5303030303030303,
839
+ "eval_runtime": 2.9603,
840
+ "eval_samples_per_second": 44.59,
841
+ "eval_steps_per_second": 5.743,
842
  "step": 203
843
  },
844
  {
845
  "epoch": 7.0344827586206895,
846
+ "grad_norm": 6.276855945587158,
847
  "learning_rate": 9.157088122605364e-05,
848
+ "loss": 1.2212,
849
  "step": 204
850
  },
851
  {
852
  "epoch": 7.103448275862069,
853
+ "grad_norm": 13.91729736328125,
854
  "learning_rate": 9.137931034482759e-05,
855
+ "loss": 0.9592,
856
  "step": 206
857
  },
858
  {
859
  "epoch": 7.172413793103448,
860
+ "grad_norm": 8.206171035766602,
861
  "learning_rate": 9.118773946360154e-05,
862
+ "loss": 1.0261,
863
  "step": 208
864
  },
865
  {
866
  "epoch": 7.241379310344827,
867
+ "grad_norm": 6.51706075668335,
868
  "learning_rate": 9.099616858237548e-05,
869
+ "loss": 1.0831,
870
  "step": 210
871
  },
872
  {
873
  "epoch": 7.310344827586207,
874
+ "grad_norm": 10.827319145202637,
875
  "learning_rate": 9.080459770114943e-05,
876
+ "loss": 1.048,
877
  "step": 212
878
  },
879
  {
880
  "epoch": 7.379310344827586,
881
+ "grad_norm": 8.241643905639648,
882
  "learning_rate": 9.061302681992338e-05,
883
+ "loss": 1.0182,
884
  "step": 214
885
  },
886
  {
887
  "epoch": 7.448275862068965,
888
+ "grad_norm": 6.14633321762085,
889
  "learning_rate": 9.042145593869731e-05,
890
+ "loss": 0.6089,
891
  "step": 216
892
  },
893
  {
894
  "epoch": 7.517241379310345,
895
+ "grad_norm": 7.247577667236328,
896
  "learning_rate": 9.022988505747126e-05,
897
+ "loss": 0.775,
898
  "step": 218
899
  },
900
  {
901
  "epoch": 7.586206896551724,
902
+ "grad_norm": 8.919748306274414,
903
  "learning_rate": 9.003831417624522e-05,
904
+ "loss": 0.9039,
905
  "step": 220
906
  },
907
  {
908
  "epoch": 7.655172413793103,
909
+ "grad_norm": 7.243560791015625,
910
  "learning_rate": 8.984674329501917e-05,
911
+ "loss": 0.7609,
912
  "step": 222
913
  },
914
  {
915
  "epoch": 7.724137931034483,
916
+ "grad_norm": 10.010783195495605,
917
  "learning_rate": 8.96551724137931e-05,
918
+ "loss": 0.7431,
919
  "step": 224
920
  },
921
  {
922
  "epoch": 7.793103448275862,
923
+ "grad_norm": 11.02781867980957,
924
  "learning_rate": 8.946360153256705e-05,
925
+ "loss": 1.5126,
926
  "step": 226
927
  },
928
  {
929
  "epoch": 7.862068965517241,
930
+ "grad_norm": 7.93005895614624,
931
  "learning_rate": 8.9272030651341e-05,
932
+ "loss": 0.5022,
933
  "step": 228
934
  },
935
  {
936
  "epoch": 7.931034482758621,
937
+ "grad_norm": 12.424070358276367,
938
  "learning_rate": 8.908045977011495e-05,
939
+ "loss": 0.9272,
940
  "step": 230
941
  },
942
  {
943
  "epoch": 8.0,
944
+ "grad_norm": 12.335293769836426,
945
  "learning_rate": 8.888888888888889e-05,
946
+ "loss": 1.7256,
947
  "step": 232
948
  },
949
  {
950
  "epoch": 8.0,
951
+ "eval_accuracy": 0.5681818181818182,
952
+ "eval_f1_macro": 0.4805299838894464,
953
+ "eval_f1_micro": 0.5681818181818182,
954
+ "eval_f1_weighted": 0.5434771053711297,
955
+ "eval_loss": 1.490796446800232,
956
+ "eval_precision_macro": 0.5332785669808334,
957
+ "eval_precision_micro": 0.5681818181818182,
958
+ "eval_precision_weighted": 0.6122299205283115,
959
+ "eval_recall_macro": 0.5219198790627363,
960
+ "eval_recall_micro": 0.5681818181818182,
961
+ "eval_recall_weighted": 0.5681818181818182,
962
+ "eval_runtime": 2.9681,
963
+ "eval_samples_per_second": 44.473,
964
+ "eval_steps_per_second": 5.728,
965
  "step": 232
966
  },
967
  {
968
  "epoch": 8.068965517241379,
969
+ "grad_norm": 8.515727996826172,
970
  "learning_rate": 8.869731800766284e-05,
971
+ "loss": 0.6998,
972
  "step": 234
973
  },
974
  {
975
  "epoch": 8.137931034482758,
976
+ "grad_norm": 6.3350934982299805,
977
  "learning_rate": 8.850574712643679e-05,
978
+ "loss": 0.6902,
979
  "step": 236
980
  },
981
  {
982
  "epoch": 8.206896551724139,
983
+ "grad_norm": 6.2341132164001465,
984
  "learning_rate": 8.831417624521074e-05,
985
+ "loss": 0.7862,
986
  "step": 238
987
  },
988
  {
989
  "epoch": 8.275862068965518,
990
+ "grad_norm": 5.668937683105469,
991
  "learning_rate": 8.812260536398468e-05,
992
+ "loss": 0.8759,
993
  "step": 240
994
  },
995
  {
996
  "epoch": 8.344827586206897,
997
+ "grad_norm": 6.639297962188721,
998
  "learning_rate": 8.793103448275862e-05,
999
+ "loss": 0.5213,
1000
  "step": 242
1001
  },
1002
  {
1003
  "epoch": 8.413793103448276,
1004
+ "grad_norm": 8.357873916625977,
1005
  "learning_rate": 8.773946360153256e-05,
1006
+ "loss": 0.7463,
1007
  "step": 244
1008
  },
1009
  {
1010
  "epoch": 8.482758620689655,
1011
+ "grad_norm": 6.915083885192871,
1012
  "learning_rate": 8.754789272030651e-05,
1013
+ "loss": 0.7049,
1014
  "step": 246
1015
  },
1016
  {
1017
  "epoch": 8.551724137931034,
1018
+ "grad_norm": 7.055286407470703,
1019
  "learning_rate": 8.735632183908047e-05,
1020
+ "loss": 0.731,
1021
  "step": 248
1022
  },
1023
  {
1024
  "epoch": 8.620689655172415,
1025
+ "grad_norm": 10.977728843688965,
1026
  "learning_rate": 8.716475095785441e-05,
1027
+ "loss": 0.9833,
1028
  "step": 250
1029
  },
1030
  {
1031
  "epoch": 8.689655172413794,
1032
+ "grad_norm": 10.40459156036377,
1033
  "learning_rate": 8.697318007662835e-05,
1034
+ "loss": 0.6566,
1035
  "step": 252
1036
  },
1037
  {
1038
  "epoch": 8.758620689655173,
1039
+ "grad_norm": 7.625058650970459,
1040
  "learning_rate": 8.67816091954023e-05,
1041
+ "loss": 0.9152,
1042
  "step": 254
1043
  },
1044
  {
1045
  "epoch": 8.827586206896552,
1046
+ "grad_norm": 12.976395606994629,
1047
  "learning_rate": 8.659003831417625e-05,
1048
+ "loss": 1.1152,
1049
  "step": 256
1050
  },
1051
  {
1052
  "epoch": 8.89655172413793,
1053
+ "grad_norm": 7.554351329803467,
1054
  "learning_rate": 8.63984674329502e-05,
1055
+ "loss": 0.794,
1056
  "step": 258
1057
  },
1058
  {
1059
  "epoch": 8.96551724137931,
1060
+ "grad_norm": 10.101217269897461,
1061
  "learning_rate": 8.620689655172413e-05,
1062
+ "loss": 0.4549,
1063
  "step": 260
1064
  },
1065
  {
1066
  "epoch": 9.0,
1067
+ "eval_accuracy": 0.6136363636363636,
1068
+ "eval_f1_macro": 0.5270088365794784,
1069
+ "eval_f1_micro": 0.6136363636363636,
1070
+ "eval_f1_weighted": 0.5647711520056131,
1071
+ "eval_loss": 1.296909213066101,
1072
+ "eval_precision_macro": 0.6663623344074471,
1073
+ "eval_precision_micro": 0.6136363636363636,
1074
+ "eval_precision_weighted": 0.675660742766006,
1075
+ "eval_recall_macro": 0.5526152683295541,
1076
+ "eval_recall_micro": 0.6136363636363636,
1077
+ "eval_recall_weighted": 0.6136363636363636,
1078
+ "eval_runtime": 2.9637,
1079
+ "eval_samples_per_second": 44.538,
1080
+ "eval_steps_per_second": 5.736,
1081
  "step": 261
1082
  },
1083
  {
1084
  "epoch": 9.03448275862069,
1085
+ "grad_norm": 7.993361949920654,
1086
  "learning_rate": 8.601532567049809e-05,
1087
+ "loss": 0.9663,
1088
  "step": 262
1089
  },
1090
  {
1091
  "epoch": 9.10344827586207,
1092
+ "grad_norm": 7.356778621673584,
1093
  "learning_rate": 8.582375478927204e-05,
1094
+ "loss": 0.8349,
1095
  "step": 264
1096
  },
1097
  {
1098
  "epoch": 9.172413793103448,
1099
+ "grad_norm": 7.286875247955322,
1100
  "learning_rate": 8.563218390804599e-05,
1101
+ "loss": 0.6199,
1102
  "step": 266
1103
  },
1104
  {
1105
  "epoch": 9.241379310344827,
1106
+ "grad_norm": 7.223015785217285,
1107
  "learning_rate": 8.544061302681992e-05,
1108
+ "loss": 0.389,
1109
  "step": 268
1110
  },
1111
  {
1112
  "epoch": 9.310344827586206,
1113
+ "grad_norm": 8.316946029663086,
1114
  "learning_rate": 8.524904214559387e-05,
1115
+ "loss": 0.4572,
1116
  "step": 270
1117
  },
1118
  {
1119
  "epoch": 9.379310344827585,
1120
+ "grad_norm": 9.565052032470703,
1121
  "learning_rate": 8.505747126436782e-05,
1122
+ "loss": 0.5443,
1123
  "step": 272
1124
  },
1125
  {
1126
  "epoch": 9.448275862068966,
1127
+ "grad_norm": 7.592960357666016,
1128
  "learning_rate": 8.486590038314178e-05,
1129
+ "loss": 0.6893,
1130
  "step": 274
1131
  },
1132
  {
1133
  "epoch": 9.517241379310345,
1134
+ "grad_norm": 9.539419174194336,
1135
  "learning_rate": 8.467432950191571e-05,
1136
+ "loss": 0.485,
1137
  "step": 276
1138
  },
1139
  {
1140
  "epoch": 9.586206896551724,
1141
+ "grad_norm": 5.9389729499816895,
1142
  "learning_rate": 8.448275862068966e-05,
1143
+ "loss": 0.4296,
1144
  "step": 278
1145
  },
1146
  {
1147
  "epoch": 9.655172413793103,
1148
+ "grad_norm": 7.698464393615723,
1149
  "learning_rate": 8.42911877394636e-05,
1150
+ "loss": 0.361,
1151
  "step": 280
1152
  },
1153
  {
1154
  "epoch": 9.724137931034482,
1155
+ "grad_norm": 7.502455234527588,
1156
  "learning_rate": 8.409961685823755e-05,
1157
+ "loss": 0.6901,
1158
  "step": 282
1159
  },
1160
  {
1161
  "epoch": 9.793103448275861,
1162
+ "grad_norm": 6.852280616760254,
1163
  "learning_rate": 8.39080459770115e-05,
1164
+ "loss": 0.8456,
1165
  "step": 284
1166
  },
1167
  {
1168
  "epoch": 9.862068965517242,
1169
+ "grad_norm": 12.406436920166016,
1170
  "learning_rate": 8.371647509578544e-05,
1171
+ "loss": 0.6653,
1172
  "step": 286
1173
  },
1174
  {
1175
  "epoch": 9.931034482758621,
1176
+ "grad_norm": 9.554862022399902,
1177
  "learning_rate": 8.35249042145594e-05,
1178
+ "loss": 0.6775,
1179
  "step": 288
1180
  },
1181
  {
1182
  "epoch": 10.0,
1183
+ "grad_norm": 4.816894054412842,
1184
  "learning_rate": 8.333333333333334e-05,
1185
+ "loss": 0.5877,
1186
  "step": 290
1187
  },
1188
  {
1189
  "epoch": 10.0,
1190
+ "eval_accuracy": 0.5757575757575758,
1191
+ "eval_f1_macro": 0.46384154004925054,
1192
+ "eval_f1_micro": 0.5757575757575758,
1193
+ "eval_f1_weighted": 0.5271337941318629,
1194
+ "eval_loss": 1.358140468597412,
1195
+ "eval_precision_macro": 0.5631969710961309,
1196
+ "eval_precision_micro": 0.5757575757575758,
1197
+ "eval_precision_weighted": 0.6293128440187263,
1198
+ "eval_recall_macro": 0.5095389266817837,
1199
+ "eval_recall_micro": 0.5757575757575758,
1200
+ "eval_recall_weighted": 0.5757575757575758,
1201
+ "eval_runtime": 2.9551,
1202
+ "eval_samples_per_second": 44.668,
1203
+ "eval_steps_per_second": 5.753,
1204
  "step": 290
1205
  },
1206
  {
1207
  "epoch": 10.068965517241379,
1208
+ "grad_norm": 12.736379623413086,
1209
  "learning_rate": 8.314176245210729e-05,
1210
+ "loss": 0.522,
1211
  "step": 292
1212
  },
1213
  {
1214
  "epoch": 10.137931034482758,
1215
+ "grad_norm": 7.122078895568848,
1216
  "learning_rate": 8.295019157088123e-05,
1217
+ "loss": 0.572,
1218
  "step": 294
1219
  },
1220
  {
1221
  "epoch": 10.206896551724139,
1222
+ "grad_norm": 8.840079307556152,
1223
  "learning_rate": 8.275862068965517e-05,
1224
+ "loss": 0.6279,
1225
  "step": 296
1226
  },
1227
  {
1228
  "epoch": 10.275862068965518,
1229
+ "grad_norm": 8.563687324523926,
1230
  "learning_rate": 8.256704980842912e-05,
1231
+ "loss": 0.5381,
1232
  "step": 298
1233
  },
1234
  {
1235
  "epoch": 10.344827586206897,
1236
+ "grad_norm": 4.0482587814331055,
1237
  "learning_rate": 8.237547892720307e-05,
1238
+ "loss": 0.3322,
1239
  "step": 300
1240
  },
1241
  {
1242
  "epoch": 10.413793103448276,
1243
+ "grad_norm": 8.282402038574219,
1244
  "learning_rate": 8.218390804597702e-05,
1245
+ "loss": 0.4547,
1246
  "step": 302
1247
  },
1248
  {
1249
  "epoch": 10.482758620689655,
1250
+ "grad_norm": 5.410398960113525,
1251
  "learning_rate": 8.199233716475096e-05,
1252
+ "loss": 0.4316,
1253
  "step": 304
1254
  },
1255
  {
1256
  "epoch": 10.551724137931034,
1257
+ "grad_norm": 9.745489120483398,
1258
  "learning_rate": 8.180076628352491e-05,
1259
+ "loss": 0.9735,
1260
  "step": 306
1261
  },
1262
  {
1263
  "epoch": 10.620689655172415,
1264
+ "grad_norm": 5.827289581298828,
1265
  "learning_rate": 8.160919540229886e-05,
1266
+ "loss": 0.3934,
1267
  "step": 308
1268
  },
1269
  {
1270
  "epoch": 10.689655172413794,
1271
+ "grad_norm": 12.345109939575195,
1272
  "learning_rate": 8.14176245210728e-05,
1273
+ "loss": 0.8064,
1274
  "step": 310
1275
  },
1276
  {
1277
  "epoch": 10.758620689655173,
1278
+ "grad_norm": 9.329832077026367,
1279
  "learning_rate": 8.122605363984674e-05,
1280
+ "loss": 0.6792,
1281
  "step": 312
1282
  },
1283
  {
1284
  "epoch": 10.827586206896552,
1285
+ "grad_norm": 5.181191444396973,
1286
  "learning_rate": 8.103448275862069e-05,
1287
+ "loss": 0.4054,
1288
  "step": 314
1289
  },
1290
  {
1291
  "epoch": 10.89655172413793,
1292
+ "grad_norm": 7.552645206451416,
1293
  "learning_rate": 8.084291187739465e-05,
1294
+ "loss": 0.1956,
1295
  "step": 316
1296
  },
1297
  {
1298
  "epoch": 10.96551724137931,
1299
+ "grad_norm": 5.599520206451416,
1300
  "learning_rate": 8.06513409961686e-05,
1301
+ "loss": 0.3451,
1302
  "step": 318
1303
  },
1304
  {
1305
  "epoch": 11.0,
1306
+ "eval_accuracy": 0.6136363636363636,
1307
+ "eval_f1_macro": 0.5613081171482324,
1308
+ "eval_f1_micro": 0.6136363636363636,
1309
+ "eval_f1_weighted": 0.6066036873275266,
1310
+ "eval_loss": 1.2490617036819458,
1311
+ "eval_precision_macro": 0.5909090909090909,
1312
+ "eval_precision_micro": 0.6136363636363636,
1313
+ "eval_precision_weighted": 0.6110537190082644,
1314
+ "eval_recall_macro": 0.5589115646258503,
1315
+ "eval_recall_micro": 0.6136363636363636,
1316
+ "eval_recall_weighted": 0.6136363636363636,
1317
+ "eval_runtime": 2.9702,
1318
+ "eval_samples_per_second": 44.441,
1319
+ "eval_steps_per_second": 5.723,
1320
  "step": 319
1321
  },
1322
  {
1323
  "epoch": 11.03448275862069,
1324
+ "grad_norm": 4.307100772857666,
1325
  "learning_rate": 8.045977011494253e-05,
1326
+ "loss": 0.2913,
1327
  "step": 320
1328
  },
1329
  {
1330
  "epoch": 11.10344827586207,
1331
+ "grad_norm": 4.337777137756348,
1332
  "learning_rate": 8.026819923371648e-05,
1333
+ "loss": 0.223,
1334
  "step": 322
1335
  },
1336
  {
1337
  "epoch": 11.172413793103448,
1338
+ "grad_norm": 5.760768413543701,
1339
  "learning_rate": 8.007662835249042e-05,
1340
+ "loss": 0.4768,
1341
  "step": 324
1342
  },
1343
  {
1344
  "epoch": 11.241379310344827,
1345
+ "grad_norm": 11.01797103881836,
1346
  "learning_rate": 7.988505747126437e-05,
1347
+ "loss": 0.4892,
1348
  "step": 326
1349
  },
1350
  {
1351
  "epoch": 11.310344827586206,
1352
+ "grad_norm": 8.456830024719238,
1353
  "learning_rate": 7.969348659003832e-05,
1354
+ "loss": 0.3436,
1355
  "step": 328
1356
  },
1357
  {
1358
  "epoch": 11.379310344827585,
1359
+ "grad_norm": 8.432597160339355,
1360
  "learning_rate": 7.950191570881227e-05,
1361
+ "loss": 0.406,
1362
  "step": 330
1363
  },
1364
  {
1365
  "epoch": 11.448275862068966,
1366
+ "grad_norm": 10.220959663391113,
1367
  "learning_rate": 7.931034482758621e-05,
1368
+ "loss": 0.3734,
1369
  "step": 332
1370
  },
1371
  {
1372
  "epoch": 11.517241379310345,
1373
+ "grad_norm": 5.659406661987305,
1374
  "learning_rate": 7.911877394636016e-05,
1375
+ "loss": 0.2822,
1376
  "step": 334
1377
  },
1378
  {
1379
  "epoch": 11.586206896551724,
1380
+ "grad_norm": 1.9529658555984497,
1381
  "learning_rate": 7.892720306513411e-05,
1382
+ "loss": 0.3656,
1383
  "step": 336
1384
  },
1385
  {
1386
  "epoch": 11.655172413793103,
1387
+ "grad_norm": 6.910799980163574,
1388
  "learning_rate": 7.873563218390804e-05,
1389
+ "loss": 0.5396,
1390
  "step": 338
1391
  },
1392
  {
1393
  "epoch": 11.724137931034482,
1394
+ "grad_norm": 9.418132781982422,
1395
  "learning_rate": 7.854406130268199e-05,
1396
+ "loss": 0.4497,
1397
  "step": 340
1398
  },
1399
  {
1400
  "epoch": 11.793103448275861,
1401
+ "grad_norm": 10.316533088684082,
1402
  "learning_rate": 7.835249042145594e-05,
1403
+ "loss": 0.3782,
1404
  "step": 342
1405
  },
1406
  {
1407
  "epoch": 11.862068965517242,
1408
+ "grad_norm": 5.654300689697266,
1409
  "learning_rate": 7.81609195402299e-05,
1410
+ "loss": 0.3447,
1411
  "step": 344
1412
  },
1413
  {
1414
  "epoch": 11.931034482758621,
1415
+ "grad_norm": 6.8490800857543945,
1416
  "learning_rate": 7.796934865900383e-05,
1417
+ "loss": 0.205,
1418
  "step": 346
1419
  },
1420
  {
1421
  "epoch": 12.0,
1422
+ "grad_norm": 10.45013427734375,
1423
  "learning_rate": 7.777777777777778e-05,
1424
+ "loss": 0.4885,
1425
  "step": 348
1426
  },
1427
  {
1428
  "epoch": 12.0,
1429
+ "eval_accuracy": 0.6287878787878788,
1430
+ "eval_f1_macro": 0.5380526246743514,
1431
+ "eval_f1_micro": 0.6287878787878788,
1432
+ "eval_f1_weighted": 0.6086896087349808,
1433
+ "eval_loss": 1.6861677169799805,
1434
+ "eval_precision_macro": 0.5514896867838044,
1435
+ "eval_precision_micro": 0.6287878787878788,
1436
+ "eval_precision_weighted": 0.6224606432716949,
1437
+ "eval_recall_macro": 0.5575736961451246,
1438
+ "eval_recall_micro": 0.6287878787878788,
1439
+ "eval_recall_weighted": 0.6287878787878788,
1440
+ "eval_runtime": 2.9665,
1441
+ "eval_samples_per_second": 44.497,
1442
+ "eval_steps_per_second": 5.731,
1443
  "step": 348
1444
  },
1445
  {
1446
  "epoch": 12.068965517241379,
1447
+ "grad_norm": 13.315044403076172,
1448
  "learning_rate": 7.758620689655173e-05,
1449
+ "loss": 0.6786,
1450
  "step": 350
1451
  },
1452
  {
1453
  "epoch": 12.137931034482758,
1454
+ "grad_norm": 7.0229034423828125,
1455
  "learning_rate": 7.739463601532568e-05,
1456
+ "loss": 0.4346,
1457
  "step": 352
1458
  },
1459
  {
1460
  "epoch": 12.206896551724139,
1461
+ "grad_norm": 9.44849967956543,
1462
  "learning_rate": 7.720306513409961e-05,
1463
+ "loss": 0.4842,
1464
  "step": 354
1465
  },
1466
  {
1467
  "epoch": 12.275862068965518,
1468
+ "grad_norm": 2.0537302494049072,
1469
  "learning_rate": 7.701149425287356e-05,
1470
+ "loss": 0.0921,
1471
  "step": 356
1472
  },
1473
  {
1474
  "epoch": 12.344827586206897,
1475
+ "grad_norm": 7.648636341094971,
1476
  "learning_rate": 7.681992337164752e-05,
1477
+ "loss": 0.4003,
1478
  "step": 358
1479
  },
1480
  {
1481
  "epoch": 12.413793103448276,
1482
+ "grad_norm": 8.219009399414062,
1483
  "learning_rate": 7.662835249042147e-05,
1484
+ "loss": 0.57,
1485
  "step": 360
1486
  },
1487
  {
1488
  "epoch": 12.482758620689655,
1489
+ "grad_norm": 8.560858726501465,
1490
  "learning_rate": 7.64367816091954e-05,
1491
+ "loss": 0.2207,
1492
  "step": 362
1493
  },
1494
  {
1495
  "epoch": 12.551724137931034,
1496
+ "grad_norm": 4.989181995391846,
1497
  "learning_rate": 7.624521072796935e-05,
1498
+ "loss": 0.1446,
1499
  "step": 364
1500
  },
1501
  {
1502
  "epoch": 12.620689655172415,
1503
+ "grad_norm": 5.454369068145752,
1504
  "learning_rate": 7.60536398467433e-05,
1505
+ "loss": 0.6619,
1506
  "step": 366
1507
  },
1508
  {
1509
  "epoch": 12.689655172413794,
1510
+ "grad_norm": 4.803226470947266,
1511
  "learning_rate": 7.586206896551724e-05,
1512
+ "loss": 0.2908,
1513
  "step": 368
1514
  },
1515
  {
1516
  "epoch": 12.758620689655173,
1517
+ "grad_norm": 14.855594635009766,
1518
  "learning_rate": 7.567049808429119e-05,
1519
+ "loss": 0.8779,
1520
  "step": 370
1521
  },
1522
  {
1523
  "epoch": 12.827586206896552,
1524
+ "grad_norm": 6.69237756729126,
1525
  "learning_rate": 7.547892720306514e-05,
1526
+ "loss": 0.2755,
1527
  "step": 372
1528
  },
1529
  {
1530
  "epoch": 12.89655172413793,
1531
+ "grad_norm": 11.00119400024414,
1532
  "learning_rate": 7.528735632183909e-05,
1533
+ "loss": 0.2853,
1534
  "step": 374
1535
  },
1536
  {
1537
  "epoch": 12.96551724137931,
1538
+ "grad_norm": 12.996737480163574,
1539
  "learning_rate": 7.509578544061303e-05,
1540
+ "loss": 0.3835,
1541
  "step": 376
1542
  },
1543
  {
1544
  "epoch": 13.0,
1545
+ "eval_accuracy": 0.5378787878787878,
1546
+ "eval_f1_macro": 0.5317693409365079,
1547
+ "eval_f1_micro": 0.5378787878787878,
1548
+ "eval_f1_weighted": 0.5440018882102533,
1549
+ "eval_loss": 1.8354477882385254,
1550
+ "eval_precision_macro": 0.6396235326975365,
1551
+ "eval_precision_micro": 0.5378787878787878,
1552
+ "eval_precision_weighted": 0.6577377477843236,
1553
+ "eval_recall_macro": 0.5264021164021164,
1554
+ "eval_recall_micro": 0.5378787878787878,
1555
+ "eval_recall_weighted": 0.5378787878787878,
1556
+ "eval_runtime": 2.966,
1557
+ "eval_samples_per_second": 44.504,
1558
+ "eval_steps_per_second": 5.732,
1559
  "step": 377
1560
  },
1561
  {
1562
+ "epoch": 13.0,
1563
+ "step": 377,
1564
+ "total_flos": 1.6452764844550595e+18,
1565
+ "train_loss": 1.068754496403651,
1566
+ "train_runtime": 506.3845,
1567
+ "train_samples_per_second": 36.494,
1568
+ "train_steps_per_second": 2.291
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1569
  }
1570
  ],
1571
  "logging_steps": 2,
 
1576
  "stateful_callbacks": {
1577
  "EarlyStoppingCallback": {
1578
  "args": {
1579
+ "early_stopping_patience": 8,
1580
  "early_stopping_threshold": 0.0
1581
  },
1582
  "attributes": {
1583
+ "early_stopping_patience_counter": 8
1584
  }
1585
  },
1586
  "TrainerControl": {
 
1594
  "attributes": {}
1595
  }
1596
  },
1597
+ "total_flos": 1.6452764844550595e+18,
1598
  "train_batch_size": 8,
1599
  "trial_name": null,
1600
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1dc21886c3b6c37cb63ef3663e12b5d93a033b72ef9aca441a35317dd3a6ba36
3
  size 5368
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3b2b32deb94bfd846b2c9efdfeba4bb55ba632e4f045fd94493f234b59c88c4b
3
  size 5368