ZipperDeng committed on
Commit
cd63b02
·
verified ·
1 Parent(s): 7c1f527

End of training

Browse files
README.md ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ license: mit
4
+ base_model: TencentGameMate/chinese-hubert-base
5
+ tags:
6
+ - generated_from_trainer
7
+ metrics:
8
+ - accuracy
9
+ model-index:
10
+ - name: hubert-base-ser
11
+ results: []
12
+ ---
13
+
14
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
15
+ should probably proofread and complete it, then remove this comment. -->
16
+
17
+ # hubert-base-ser
18
+
19
+ This model is a fine-tuned version of [TencentGameMate/chinese-hubert-base](https://huggingface.co/TencentGameMate/chinese-hubert-base) on an unknown dataset.
20
+ It achieves the following results on the evaluation set:
21
+ - Loss: 0.1466
22
+ - Accuracy: 0.9526
23
+
24
+ ## Model description
25
+
26
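+ A HuBERT-base speech emotion recognition classifier (`HubertForSpeechClassification`, mean pooling over hidden states) fine-tuned from `TencentGameMate/chinese-hubert-base`. It predicts one of five labels (Angry, Happy, Neutral, Sad, Surprise) from 16 kHz audio.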
27
+
28
+ ## Intended uses & limitations
29
+
30
+ More information needed
31
+
32
+ ## Training and evaluation data
33
+
34
+ More information needed
35
+
36
+ ## Training procedure
37
+
38
+ ### Training hyperparameters
39
+
40
+ The following hyperparameters were used during training:
41
+ - learning_rate: 0.0001
42
+ - train_batch_size: 32
43
+ - eval_batch_size: 4
44
+ - seed: 42
45
+ - gradient_accumulation_steps: 2
46
+ - total_train_batch_size: 64
47
+ - optimizer: adamw_torch with betas=(0.9,0.999) and epsilon=1e-08 (no additional optimizer arguments)
48
+ - lr_scheduler_type: linear
49
+ - num_epochs: 1.0
50
+ - mixed_precision_training: Native AMP
51
+
52
+ ### Training results
53
+
54
+ | Training Loss | Epoch | Step | Validation Loss | Accuracy |
55
+ |:-------------:|:------:|:----:|:---------------:|:--------:|
56
+ | 0.9709 | 0.0229 | 10 | 0.8923 | 0.6399 |
57
+ | 0.9219 | 0.0457 | 20 | 0.6903 | 0.7664 |
58
+ | 0.7112 | 0.0686 | 30 | 0.5838 | 0.7909 |
59
+ | 0.567 | 0.0914 | 40 | 0.5405 | 0.8159 |
60
+ | 0.6184 | 0.1143 | 50 | 0.4148 | 0.8581 |
61
+ | 0.5291 | 0.1371 | 60 | 0.4444 | 0.8511 |
62
+ | 0.533 | 0.16 | 70 | 0.4643 | 0.8271 |
63
+ | 0.4753 | 0.1829 | 80 | 0.3560 | 0.8767 |
64
+ | 0.4252 | 0.2057 | 90 | 0.5889 | 0.8103 |
65
+ | 0.5007 | 0.2286 | 100 | 0.3882 | 0.8663 |
66
+ | 0.5605 | 0.2514 | 110 | 0.3221 | 0.8921 |
67
+ | 0.4875 | 0.2743 | 120 | 0.3639 | 0.8559 |
68
+ | 0.4277 | 0.2971 | 130 | 0.3571 | 0.8746 |
69
+ | 0.3415 | 0.32 | 140 | 0.3382 | 0.8861 |
70
+ | 0.413 | 0.3429 | 150 | 0.2596 | 0.9104 |
71
+ | 0.377 | 0.3657 | 160 | 0.3519 | 0.8711 |
72
+ | 0.4219 | 0.3886 | 170 | 0.2979 | 0.8947 |
73
+ | 0.3317 | 0.4114 | 180 | 0.2227 | 0.9226 |
74
+ | 0.3131 | 0.4343 | 190 | 0.3680 | 0.8693 |
75
+ | 0.3266 | 0.4571 | 200 | 0.2098 | 0.9309 |
76
+ | 0.3306 | 0.48 | 210 | 0.3849 | 0.8824 |
77
+ | 0.3037 | 0.5029 | 220 | 0.2852 | 0.9024 |
78
+ | 0.3086 | 0.5257 | 230 | 0.2725 | 0.9121 |
79
+ | 0.2576 | 0.5486 | 240 | 0.1869 | 0.9356 |
80
+ | 0.2469 | 0.5714 | 250 | 0.2262 | 0.9243 |
81
+ | 0.2405 | 0.5943 | 260 | 0.1963 | 0.9347 |
82
+ | 0.2802 | 0.6171 | 270 | 0.3680 | 0.8804 |
83
+ | 0.2442 | 0.64 | 280 | 0.2053 | 0.9293 |
84
+ | 0.2302 | 0.6629 | 290 | 0.3356 | 0.8967 |
85
+ | 0.2492 | 0.6857 | 300 | 0.1880 | 0.9371 |
86
+ | 0.2089 | 0.7086 | 310 | 0.2076 | 0.9289 |
87
+ | 0.2824 | 0.7314 | 320 | 0.1999 | 0.9301 |
88
+ | 0.2009 | 0.7543 | 330 | 0.1492 | 0.9521 |
89
+ | 0.2001 | 0.7771 | 340 | 0.1496 | 0.9517 |
90
+ | 0.2298 | 0.8 | 350 | 0.1579 | 0.9490 |
91
+ | 0.1802 | 0.8229 | 360 | 0.1506 | 0.9501 |
92
+ | 0.1914 | 0.8457 | 370 | 0.2036 | 0.9311 |
93
+ | 0.1897 | 0.8686 | 380 | 0.1838 | 0.9383 |
94
+ | 0.1203 | 0.8914 | 390 | 0.1459 | 0.9504 |
95
+ | 0.1372 | 0.9143 | 400 | 0.1748 | 0.9419 |
96
+ | 0.1942 | 0.9371 | 410 | 0.1813 | 0.9406 |
97
+ | 0.1886 | 0.96 | 420 | 0.1536 | 0.9510 |
98
+ | 0.1872 | 0.9829 | 430 | 0.1466 | 0.9526 |
99
+
100
+
101
+ ### Framework versions
102
+
103
+ - Transformers 4.47.0
104
+ - Pytorch 2.4.1+cu118
105
+ - Datasets 3.6.0
106
+ - Tokenizers 0.21.0
all_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 0.9988571428571429,
3
+ "total_flos": 1.3128537437918904e+18,
4
+ "train_loss": 0.3557066834218442,
5
+ "train_runtime": 12202.3201,
6
+ "train_samples": 28000,
7
+ "train_samples_per_second": 2.295,
8
+ "train_steps_per_second": 0.036
9
+ }
config.json ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "TencentGameMate/chinese-hubert-base",
3
+ "activation_dropout": 0.1,
4
+ "apply_spec_augment": true,
5
+ "architectures": [
6
+ "HubertForSpeechClassification"
7
+ ],
8
+ "attention_dropout": 0.1,
9
+ "bos_token_id": 1,
10
+ "classifier_proj_size": 256,
11
+ "conv_bias": false,
12
+ "conv_dim": [
13
+ 512,
14
+ 512,
15
+ 512,
16
+ 512,
17
+ 512,
18
+ 512,
19
+ 512
20
+ ],
21
+ "conv_kernel": [
22
+ 10,
23
+ 3,
24
+ 3,
25
+ 3,
26
+ 3,
27
+ 2,
28
+ 2
29
+ ],
30
+ "conv_stride": [
31
+ 5,
32
+ 2,
33
+ 2,
34
+ 2,
35
+ 2,
36
+ 2,
37
+ 2
38
+ ],
39
+ "ctc_loss_reduction": "sum",
40
+ "ctc_zero_infinity": false,
41
+ "do_stable_layer_norm": false,
42
+ "eos_token_id": 2,
43
+ "feat_extract_activation": "gelu",
44
+ "feat_extract_norm": "group",
45
+ "feat_proj_dropout": 0.0,
46
+ "feat_proj_layer_norm": true,
47
+ "final_dropout": 0.1,
48
+ "finetuning_task": "wav2vec2_clf",
49
+ "hidden_act": "gelu",
50
+ "hidden_dropout": 0.1,
51
+ "hidden_size": 768,
52
+ "id2label": {
53
+ "0": "Angry",
54
+ "1": "Happy",
55
+ "2": "Neutral",
56
+ "3": "Sad",
57
+ "4": "Surprise"
58
+ },
59
+ "initializer_range": 0.02,
60
+ "intermediate_size": 3072,
61
+ "label2id": {
62
+ "Angry": 0,
63
+ "Happy": 1,
64
+ "Neutral": 2,
65
+ "Sad": 3,
66
+ "Surprise": 4
67
+ },
68
+ "layer_norm_eps": 1e-05,
69
+ "layerdrop": 0.1,
70
+ "mask_feature_length": 10,
71
+ "mask_feature_min_masks": 0,
72
+ "mask_feature_prob": 0.0,
73
+ "mask_time_length": 10,
74
+ "mask_time_min_masks": 2,
75
+ "mask_time_prob": 0.05,
76
+ "model_type": "hubert",
77
+ "num_attention_heads": 12,
78
+ "num_conv_pos_embedding_groups": 16,
79
+ "num_conv_pos_embeddings": 128,
80
+ "num_feat_extract_layers": 7,
81
+ "num_hidden_layers": 12,
82
+ "pad_token_id": 0,
83
+ "pooling_mode": "mean",
84
+ "problem_type": "single_label_classification",
85
+ "torch_dtype": "float32",
86
+ "transformers_version": "4.47.0",
87
+ "use_weighted_layer_sum": false,
88
+ "vocab_size": 32
89
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d57dffd9609c1029aee07cfb2f7d6a796d411ae54bd6ea5911681dfca89f2e4d
3
+ size 379890236
preprocessor_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_normalize": true,
3
+ "feature_extractor_type": "Wav2Vec2FeatureExtractor",
4
+ "feature_size": 1,
5
+ "padding_side": "right",
6
+ "padding_value": 0,
7
+ "return_attention_mask": false,
8
+ "sampling_rate": 16000
9
+ }
runs/Aug05_18-01-01_dengzhipeng-pc/events.out.tfevents.1754388064.dengzhipeng-pc.22996.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6d5aeda078dd282aeb2c10ce9b70fb8eb6cdbafc827615b025ee77e80dae37ae
3
+ size 6161
runs/Aug05_18-01-01_dengzhipeng-pc/events.out.tfevents.1754388350.dengzhipeng-pc.22996.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:70efb564af3877bab96664e175bc42e28eedf803a61377c4ab2be965171f4af7
3
+ size 6161
runs/Aug05_18-01-01_dengzhipeng-pc/events.out.tfevents.1754388504.dengzhipeng-pc.22996.2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:929ef6291819fb5ea9c0bb11387abc37d167e01be481804bac0049f206b23ab2
3
+ size 6161
runs/Aug05_18-01-01_dengzhipeng-pc/events.out.tfevents.1754390203.dengzhipeng-pc.22996.3 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3b932e4c4fdcc196e288297a854b39fc0426b3cfa69c2443d5a6124de658cba4
3
+ size 6210
runs/Aug05_18-01-01_dengzhipeng-pc/events.out.tfevents.1754390300.dengzhipeng-pc.22996.4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a13f7f5fd06a3cf5ec60bb1483293d6b7928f33dcc9d56febec1401a63fff7e5
3
+ size 6210
runs/Aug05_18-44-57_dengzhipeng-pc/events.out.tfevents.1754390714.dengzhipeng-pc.22996.5 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ea7e7d9d9726a643450a0b577939a86c5e1947fece6c0c4084cca8300ec28422
3
+ size 6211
runs/Aug05_18-46-12_dengzhipeng-pc/events.out.tfevents.1754390772.dengzhipeng-pc.22996.6 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:859cd8c2c9fcf9cee046ab92346063de05c23edd09bb4244de477447ff938423
3
+ size 6211
runs/Aug05_19-09-27_dengzhipeng-pc/events.out.tfevents.1754392167.dengzhipeng-pc.22996.7 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:47b5ad34997c78343e2065f694947c60707f9bac9bebb801f8585940e3defe0f
3
+ size 6210
runs/Aug05_19-09-27_dengzhipeng-pc/events.out.tfevents.1754392276.dengzhipeng-pc.22996.8 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:54043277e473862bfecd954d6916cc6bf6d3c5fce985b4ce9bd94f3d7fd4ea40
3
+ size 7989
runs/Aug05_19-09-27_dengzhipeng-pc/events.out.tfevents.1754393216.dengzhipeng-pc.22996.9 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a11e88fbe40ca4ac8d4668f97a81e41c16d6331c785614d2538fc765db77ca84
3
+ size 10587
runs/Aug05_20-02-33_dengzhipeng-pc/events.out.tfevents.1754395354.dengzhipeng-pc.22996.10 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e4be4563dbc6148f2f85c1677e3b813bee42047b57fba848cdcbe2e6392c757c
3
+ size 29385
train_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 0.9988571428571429,
3
+ "total_flos": 1.3128537437918904e+18,
4
+ "train_loss": 0.3557066834218442,
5
+ "train_runtime": 12202.3201,
6
+ "train_samples": 28000,
7
+ "train_samples_per_second": 2.295,
8
+ "train_steps_per_second": 0.036
9
+ }
trainer_state.json ADDED
@@ -0,0 +1,730 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 0.9988571428571429,
5
+ "eval_steps": 10,
6
+ "global_step": 437,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.022857142857142857,
13
+ "grad_norm": 6.883157253265381,
14
+ "learning_rate": 9.77116704805492e-05,
15
+ "loss": 0.9709,
16
+ "step": 10
17
+ },
18
+ {
19
+ "epoch": 0.022857142857142857,
20
+ "eval_accuracy": 0.6398571133613586,
21
+ "eval_loss": 0.8923419117927551,
22
+ "eval_runtime": 252.6626,
23
+ "eval_samples_per_second": 27.705,
24
+ "eval_steps_per_second": 6.926,
25
+ "step": 10
26
+ },
27
+ {
28
+ "epoch": 0.045714285714285714,
29
+ "grad_norm": 4.793847560882568,
30
+ "learning_rate": 9.542334096109841e-05,
31
+ "loss": 0.9219,
32
+ "step": 20
33
+ },
34
+ {
35
+ "epoch": 0.045714285714285714,
36
+ "eval_accuracy": 0.7664285898208618,
37
+ "eval_loss": 0.6903320550918579,
38
+ "eval_runtime": 260.5483,
39
+ "eval_samples_per_second": 26.866,
40
+ "eval_steps_per_second": 6.717,
41
+ "step": 20
42
+ },
43
+ {
44
+ "epoch": 0.06857142857142857,
45
+ "grad_norm": 6.191551685333252,
46
+ "learning_rate": 9.31350114416476e-05,
47
+ "loss": 0.7112,
48
+ "step": 30
49
+ },
50
+ {
51
+ "epoch": 0.06857142857142857,
52
+ "eval_accuracy": 0.7908571362495422,
53
+ "eval_loss": 0.5838488936424255,
54
+ "eval_runtime": 254.6091,
55
+ "eval_samples_per_second": 27.493,
56
+ "eval_steps_per_second": 6.873,
57
+ "step": 30
58
+ },
59
+ {
60
+ "epoch": 0.09142857142857143,
61
+ "grad_norm": 9.833272933959961,
62
+ "learning_rate": 9.08466819221968e-05,
63
+ "loss": 0.567,
64
+ "step": 40
65
+ },
66
+ {
67
+ "epoch": 0.09142857142857143,
68
+ "eval_accuracy": 0.8158571720123291,
69
+ "eval_loss": 0.5405334830284119,
70
+ "eval_runtime": 263.3184,
71
+ "eval_samples_per_second": 26.584,
72
+ "eval_steps_per_second": 6.646,
73
+ "step": 40
74
+ },
75
+ {
76
+ "epoch": 0.11428571428571428,
77
+ "grad_norm": 9.925666809082031,
78
+ "learning_rate": 8.878718535469108e-05,
79
+ "loss": 0.6184,
80
+ "step": 50
81
+ },
82
+ {
83
+ "epoch": 0.11428571428571428,
84
+ "eval_accuracy": 0.8581428527832031,
85
+ "eval_loss": 0.41476812958717346,
86
+ "eval_runtime": 259.1036,
87
+ "eval_samples_per_second": 27.016,
88
+ "eval_steps_per_second": 6.754,
89
+ "step": 50
90
+ },
91
+ {
92
+ "epoch": 0.13714285714285715,
93
+ "grad_norm": 3.723980665206909,
94
+ "learning_rate": 8.649885583524028e-05,
95
+ "loss": 0.5291,
96
+ "step": 60
97
+ },
98
+ {
99
+ "epoch": 0.13714285714285715,
100
+ "eval_accuracy": 0.8511428833007812,
101
+ "eval_loss": 0.44439756870269775,
102
+ "eval_runtime": 253.5826,
103
+ "eval_samples_per_second": 27.604,
104
+ "eval_steps_per_second": 6.901,
105
+ "step": 60
106
+ },
107
+ {
108
+ "epoch": 0.16,
109
+ "grad_norm": 10.508088111877441,
110
+ "learning_rate": 8.421052631578948e-05,
111
+ "loss": 0.533,
112
+ "step": 70
113
+ },
114
+ {
115
+ "epoch": 0.16,
116
+ "eval_accuracy": 0.8271428346633911,
117
+ "eval_loss": 0.4642958641052246,
118
+ "eval_runtime": 260.9488,
119
+ "eval_samples_per_second": 26.825,
120
+ "eval_steps_per_second": 6.706,
121
+ "step": 70
122
+ },
123
+ {
124
+ "epoch": 0.18285714285714286,
125
+ "grad_norm": 7.824756622314453,
126
+ "learning_rate": 8.192219679633868e-05,
127
+ "loss": 0.4753,
128
+ "step": 80
129
+ },
130
+ {
131
+ "epoch": 0.18285714285714286,
132
+ "eval_accuracy": 0.876714289188385,
133
+ "eval_loss": 0.35598087310791016,
134
+ "eval_runtime": 262.7831,
135
+ "eval_samples_per_second": 26.638,
136
+ "eval_steps_per_second": 6.659,
137
+ "step": 80
138
+ },
139
+ {
140
+ "epoch": 0.2057142857142857,
141
+ "grad_norm": 5.332316875457764,
142
+ "learning_rate": 7.963386727688788e-05,
143
+ "loss": 0.4252,
144
+ "step": 90
145
+ },
146
+ {
147
+ "epoch": 0.2057142857142857,
148
+ "eval_accuracy": 0.8102856874465942,
149
+ "eval_loss": 0.5888535380363464,
150
+ "eval_runtime": 262.7552,
151
+ "eval_samples_per_second": 26.641,
152
+ "eval_steps_per_second": 6.66,
153
+ "step": 90
154
+ },
155
+ {
156
+ "epoch": 0.22857142857142856,
157
+ "grad_norm": 17.482688903808594,
158
+ "learning_rate": 7.734553775743708e-05,
159
+ "loss": 0.5007,
160
+ "step": 100
161
+ },
162
+ {
163
+ "epoch": 0.22857142857142856,
164
+ "eval_accuracy": 0.8662857413291931,
165
+ "eval_loss": 0.38821107149124146,
166
+ "eval_runtime": 261.4572,
167
+ "eval_samples_per_second": 26.773,
168
+ "eval_steps_per_second": 6.693,
169
+ "step": 100
170
+ },
171
+ {
172
+ "epoch": 0.25142857142857145,
173
+ "grad_norm": 8.691084861755371,
174
+ "learning_rate": 7.505720823798627e-05,
175
+ "loss": 0.5605,
176
+ "step": 110
177
+ },
178
+ {
179
+ "epoch": 0.25142857142857145,
180
+ "eval_accuracy": 0.8921428322792053,
181
+ "eval_loss": 0.32210296392440796,
182
+ "eval_runtime": 261.1514,
183
+ "eval_samples_per_second": 26.804,
184
+ "eval_steps_per_second": 6.701,
185
+ "step": 110
186
+ },
187
+ {
188
+ "epoch": 0.2742857142857143,
189
+ "grad_norm": 11.754142761230469,
190
+ "learning_rate": 7.276887871853547e-05,
191
+ "loss": 0.4875,
192
+ "step": 120
193
+ },
194
+ {
195
+ "epoch": 0.2742857142857143,
196
+ "eval_accuracy": 0.8558571338653564,
197
+ "eval_loss": 0.36388570070266724,
198
+ "eval_runtime": 265.2182,
199
+ "eval_samples_per_second": 26.393,
200
+ "eval_steps_per_second": 6.598,
201
+ "step": 120
202
+ },
203
+ {
204
+ "epoch": 0.29714285714285715,
205
+ "grad_norm": 7.222925662994385,
206
+ "learning_rate": 7.048054919908466e-05,
207
+ "loss": 0.4277,
208
+ "step": 130
209
+ },
210
+ {
211
+ "epoch": 0.29714285714285715,
212
+ "eval_accuracy": 0.8745714426040649,
213
+ "eval_loss": 0.35708051919937134,
214
+ "eval_runtime": 264.6016,
215
+ "eval_samples_per_second": 26.455,
216
+ "eval_steps_per_second": 6.614,
217
+ "step": 130
218
+ },
219
+ {
220
+ "epoch": 0.32,
221
+ "grad_norm": 6.181695938110352,
222
+ "learning_rate": 6.819221967963387e-05,
223
+ "loss": 0.3415,
224
+ "step": 140
225
+ },
226
+ {
227
+ "epoch": 0.32,
228
+ "eval_accuracy": 0.8861428499221802,
229
+ "eval_loss": 0.33818891644477844,
230
+ "eval_runtime": 262.5039,
231
+ "eval_samples_per_second": 26.666,
232
+ "eval_steps_per_second": 6.667,
233
+ "step": 140
234
+ },
235
+ {
236
+ "epoch": 0.34285714285714286,
237
+ "grad_norm": 8.087543487548828,
238
+ "learning_rate": 6.590389016018307e-05,
239
+ "loss": 0.413,
240
+ "step": 150
241
+ },
242
+ {
243
+ "epoch": 0.34285714285714286,
244
+ "eval_accuracy": 0.9104285836219788,
245
+ "eval_loss": 0.2596481442451477,
246
+ "eval_runtime": 265.6837,
247
+ "eval_samples_per_second": 26.347,
248
+ "eval_steps_per_second": 6.587,
249
+ "step": 150
250
+ },
251
+ {
252
+ "epoch": 0.3657142857142857,
253
+ "grad_norm": 11.313796997070312,
254
+ "learning_rate": 6.361556064073226e-05,
255
+ "loss": 0.377,
256
+ "step": 160
257
+ },
258
+ {
259
+ "epoch": 0.3657142857142857,
260
+ "eval_accuracy": 0.8711428642272949,
261
+ "eval_loss": 0.3518799841403961,
262
+ "eval_runtime": 264.3798,
263
+ "eval_samples_per_second": 26.477,
264
+ "eval_steps_per_second": 6.619,
265
+ "step": 160
266
+ },
267
+ {
268
+ "epoch": 0.38857142857142857,
269
+ "grad_norm": 7.65640115737915,
270
+ "learning_rate": 6.132723112128147e-05,
271
+ "loss": 0.4219,
272
+ "step": 170
273
+ },
274
+ {
275
+ "epoch": 0.38857142857142857,
276
+ "eval_accuracy": 0.8947142958641052,
277
+ "eval_loss": 0.2979215681552887,
278
+ "eval_runtime": 262.8341,
279
+ "eval_samples_per_second": 26.633,
280
+ "eval_steps_per_second": 6.658,
281
+ "step": 170
282
+ },
283
+ {
284
+ "epoch": 0.4114285714285714,
285
+ "grad_norm": 6.2714433670043945,
286
+ "learning_rate": 5.903890160183066e-05,
287
+ "loss": 0.3317,
288
+ "step": 180
289
+ },
290
+ {
291
+ "epoch": 0.4114285714285714,
292
+ "eval_accuracy": 0.9225714206695557,
293
+ "eval_loss": 0.22266168892383575,
294
+ "eval_runtime": 265.1248,
295
+ "eval_samples_per_second": 26.403,
296
+ "eval_steps_per_second": 6.601,
297
+ "step": 180
298
+ },
299
+ {
300
+ "epoch": 0.4342857142857143,
301
+ "grad_norm": 8.710111618041992,
302
+ "learning_rate": 5.675057208237986e-05,
303
+ "loss": 0.3131,
304
+ "step": 190
305
+ },
306
+ {
307
+ "epoch": 0.4342857142857143,
308
+ "eval_accuracy": 0.8692857027053833,
309
+ "eval_loss": 0.3680011034011841,
310
+ "eval_runtime": 260.0056,
311
+ "eval_samples_per_second": 26.923,
312
+ "eval_steps_per_second": 6.731,
313
+ "step": 190
314
+ },
315
+ {
316
+ "epoch": 0.45714285714285713,
317
+ "grad_norm": 4.041360378265381,
318
+ "learning_rate": 5.446224256292907e-05,
319
+ "loss": 0.3266,
320
+ "step": 200
321
+ },
322
+ {
323
+ "epoch": 0.45714285714285713,
324
+ "eval_accuracy": 0.9308571219444275,
325
+ "eval_loss": 0.20981180667877197,
326
+ "eval_runtime": 256.153,
327
+ "eval_samples_per_second": 27.327,
328
+ "eval_steps_per_second": 6.832,
329
+ "step": 200
330
+ },
331
+ {
332
+ "epoch": 0.48,
333
+ "grad_norm": 10.932918548583984,
334
+ "learning_rate": 5.217391304347826e-05,
335
+ "loss": 0.3306,
336
+ "step": 210
337
+ },
338
+ {
339
+ "epoch": 0.48,
340
+ "eval_accuracy": 0.8824285864830017,
341
+ "eval_loss": 0.3848917782306671,
342
+ "eval_runtime": 253.9958,
343
+ "eval_samples_per_second": 27.56,
344
+ "eval_steps_per_second": 6.89,
345
+ "step": 210
346
+ },
347
+ {
348
+ "epoch": 0.5028571428571429,
349
+ "grad_norm": 9.440160751342773,
350
+ "learning_rate": 4.9885583524027466e-05,
351
+ "loss": 0.3037,
352
+ "step": 220
353
+ },
354
+ {
355
+ "epoch": 0.5028571428571429,
356
+ "eval_accuracy": 0.9024285674095154,
357
+ "eval_loss": 0.28518444299697876,
358
+ "eval_runtime": 259.3612,
359
+ "eval_samples_per_second": 26.989,
360
+ "eval_steps_per_second": 6.747,
361
+ "step": 220
362
+ },
363
+ {
364
+ "epoch": 0.5257142857142857,
365
+ "grad_norm": 9.196854591369629,
366
+ "learning_rate": 4.759725400457666e-05,
367
+ "loss": 0.3086,
368
+ "step": 230
369
+ },
370
+ {
371
+ "epoch": 0.5257142857142857,
372
+ "eval_accuracy": 0.9121428728103638,
373
+ "eval_loss": 0.272481232881546,
374
+ "eval_runtime": 254.9581,
375
+ "eval_samples_per_second": 27.455,
376
+ "eval_steps_per_second": 6.864,
377
+ "step": 230
378
+ },
379
+ {
380
+ "epoch": 0.5485714285714286,
381
+ "grad_norm": 6.610895156860352,
382
+ "learning_rate": 4.530892448512586e-05,
383
+ "loss": 0.2576,
384
+ "step": 240
385
+ },
386
+ {
387
+ "epoch": 0.5485714285714286,
388
+ "eval_accuracy": 0.9355714321136475,
389
+ "eval_loss": 0.18688350915908813,
390
+ "eval_runtime": 255.2292,
391
+ "eval_samples_per_second": 27.426,
392
+ "eval_steps_per_second": 6.857,
393
+ "step": 240
394
+ },
395
+ {
396
+ "epoch": 0.5714285714285714,
397
+ "grad_norm": 15.24905014038086,
398
+ "learning_rate": 4.302059496567506e-05,
399
+ "loss": 0.2469,
400
+ "step": 250
401
+ },
402
+ {
403
+ "epoch": 0.5714285714285714,
404
+ "eval_accuracy": 0.9242857098579407,
405
+ "eval_loss": 0.2262311726808548,
406
+ "eval_runtime": 254.9064,
407
+ "eval_samples_per_second": 27.461,
408
+ "eval_steps_per_second": 6.865,
409
+ "step": 250
410
+ },
411
+ {
412
+ "epoch": 0.5942857142857143,
413
+ "grad_norm": 9.8357515335083,
414
+ "learning_rate": 4.073226544622426e-05,
415
+ "loss": 0.2405,
416
+ "step": 260
417
+ },
418
+ {
419
+ "epoch": 0.5942857142857143,
420
+ "eval_accuracy": 0.9347142577171326,
421
+ "eval_loss": 0.19631564617156982,
422
+ "eval_runtime": 271.1966,
423
+ "eval_samples_per_second": 25.812,
424
+ "eval_steps_per_second": 6.453,
425
+ "step": 260
426
+ },
427
+ {
428
+ "epoch": 0.6171428571428571,
429
+ "grad_norm": 19.872060775756836,
430
+ "learning_rate": 3.844393592677346e-05,
431
+ "loss": 0.2802,
432
+ "step": 270
433
+ },
434
+ {
435
+ "epoch": 0.6171428571428571,
436
+ "eval_accuracy": 0.8804285526275635,
437
+ "eval_loss": 0.3679888844490051,
438
+ "eval_runtime": 256.0669,
439
+ "eval_samples_per_second": 27.337,
440
+ "eval_steps_per_second": 6.834,
441
+ "step": 270
442
+ },
443
+ {
444
+ "epoch": 0.64,
445
+ "grad_norm": 3.6445915699005127,
446
+ "learning_rate": 3.6155606407322653e-05,
447
+ "loss": 0.2442,
448
+ "step": 280
449
+ },
450
+ {
451
+ "epoch": 0.64,
452
+ "eval_accuracy": 0.9292857050895691,
453
+ "eval_loss": 0.20533673465251923,
454
+ "eval_runtime": 255.7952,
455
+ "eval_samples_per_second": 27.366,
456
+ "eval_steps_per_second": 6.841,
457
+ "step": 280
458
+ },
459
+ {
460
+ "epoch": 0.6628571428571428,
461
+ "grad_norm": 8.114418983459473,
462
+ "learning_rate": 3.3867276887871856e-05,
463
+ "loss": 0.2302,
464
+ "step": 290
465
+ },
466
+ {
467
+ "epoch": 0.6628571428571428,
468
+ "eval_accuracy": 0.8967142701148987,
469
+ "eval_loss": 0.3355866074562073,
470
+ "eval_runtime": 257.891,
471
+ "eval_samples_per_second": 27.143,
472
+ "eval_steps_per_second": 6.786,
473
+ "step": 290
474
+ },
475
+ {
476
+ "epoch": 0.6857142857142857,
477
+ "grad_norm": 5.993322372436523,
478
+ "learning_rate": 3.157894736842105e-05,
479
+ "loss": 0.2492,
480
+ "step": 300
481
+ },
482
+ {
483
+ "epoch": 0.6857142857142857,
484
+ "eval_accuracy": 0.9371428489685059,
485
+ "eval_loss": 0.18795913457870483,
486
+ "eval_runtime": 254.5882,
487
+ "eval_samples_per_second": 27.495,
488
+ "eval_steps_per_second": 6.874,
489
+ "step": 300
490
+ },
491
+ {
492
+ "epoch": 0.7085714285714285,
493
+ "grad_norm": 6.529418468475342,
494
+ "learning_rate": 2.9290617848970254e-05,
495
+ "loss": 0.2089,
496
+ "step": 310
497
+ },
498
+ {
499
+ "epoch": 0.7085714285714285,
500
+ "eval_accuracy": 0.928857147693634,
501
+ "eval_loss": 0.2076321393251419,
502
+ "eval_runtime": 260.5938,
503
+ "eval_samples_per_second": 26.862,
504
+ "eval_steps_per_second": 6.715,
505
+ "step": 310
506
+ },
507
+ {
508
+ "epoch": 0.7314285714285714,
509
+ "grad_norm": 6.433741092681885,
510
+ "learning_rate": 2.7002288329519453e-05,
511
+ "loss": 0.2824,
512
+ "step": 320
513
+ },
514
+ {
515
+ "epoch": 0.7314285714285714,
516
+ "eval_accuracy": 0.930142879486084,
517
+ "eval_loss": 0.1999480277299881,
518
+ "eval_runtime": 255.2396,
519
+ "eval_samples_per_second": 27.425,
520
+ "eval_steps_per_second": 6.856,
521
+ "step": 320
522
+ },
523
+ {
524
+ "epoch": 0.7542857142857143,
525
+ "grad_norm": 5.394837379455566,
526
+ "learning_rate": 2.4713958810068652e-05,
527
+ "loss": 0.2009,
528
+ "step": 330
529
+ },
530
+ {
531
+ "epoch": 0.7542857142857143,
532
+ "eval_accuracy": 0.9521428346633911,
533
+ "eval_loss": 0.14918017387390137,
534
+ "eval_runtime": 258.1497,
535
+ "eval_samples_per_second": 27.116,
536
+ "eval_steps_per_second": 6.779,
537
+ "step": 330
538
+ },
539
+ {
540
+ "epoch": 0.7771428571428571,
541
+ "grad_norm": 5.843348503112793,
542
+ "learning_rate": 2.242562929061785e-05,
543
+ "loss": 0.2001,
544
+ "step": 340
545
+ },
546
+ {
547
+ "epoch": 0.7771428571428571,
548
+ "eval_accuracy": 0.951714277267456,
549
+ "eval_loss": 0.14960123598575592,
550
+ "eval_runtime": 253.1262,
551
+ "eval_samples_per_second": 27.654,
552
+ "eval_steps_per_second": 6.914,
553
+ "step": 340
554
+ },
555
+ {
556
+ "epoch": 0.8,
557
+ "grad_norm": 7.778473377227783,
558
+ "learning_rate": 2.0137299771167047e-05,
559
+ "loss": 0.2298,
560
+ "step": 350
561
+ },
562
+ {
563
+ "epoch": 0.8,
564
+ "eval_accuracy": 0.9490000009536743,
565
+ "eval_loss": 0.15794885158538818,
566
+ "eval_runtime": 258.4154,
567
+ "eval_samples_per_second": 27.088,
568
+ "eval_steps_per_second": 6.772,
569
+ "step": 350
570
+ },
571
+ {
572
+ "epoch": 0.8228571428571428,
573
+ "grad_norm": 7.672749042510986,
574
+ "learning_rate": 1.784897025171625e-05,
575
+ "loss": 0.1802,
576
+ "step": 360
577
+ },
578
+ {
579
+ "epoch": 0.8228571428571428,
580
+ "eval_accuracy": 0.9501428604125977,
581
+ "eval_loss": 0.15056686103343964,
582
+ "eval_runtime": 253.0586,
583
+ "eval_samples_per_second": 27.662,
584
+ "eval_steps_per_second": 6.915,
585
+ "step": 360
586
+ },
587
+ {
588
+ "epoch": 0.8457142857142858,
589
+ "grad_norm": 7.994875431060791,
590
+ "learning_rate": 1.5560640732265445e-05,
591
+ "loss": 0.1914,
592
+ "step": 370
593
+ },
594
+ {
595
+ "epoch": 0.8457142857142858,
596
+ "eval_accuracy": 0.9311428666114807,
597
+ "eval_loss": 0.20363783836364746,
598
+ "eval_runtime": 261.3379,
599
+ "eval_samples_per_second": 26.785,
600
+ "eval_steps_per_second": 6.696,
601
+ "step": 370
602
+ },
603
+ {
604
+ "epoch": 0.8685714285714285,
605
+ "grad_norm": 3.988149404525757,
606
+ "learning_rate": 1.3272311212814645e-05,
607
+ "loss": 0.1897,
608
+ "step": 380
609
+ },
610
+ {
611
+ "epoch": 0.8685714285714285,
612
+ "eval_accuracy": 0.9382857084274292,
613
+ "eval_loss": 0.18375040590763092,
614
+ "eval_runtime": 256.8539,
615
+ "eval_samples_per_second": 27.253,
616
+ "eval_steps_per_second": 6.813,
617
+ "step": 380
618
+ },
619
+ {
620
+ "epoch": 0.8914285714285715,
621
+ "grad_norm": 7.280108451843262,
622
+ "learning_rate": 1.0983981693363844e-05,
623
+ "loss": 0.1203,
624
+ "step": 390
625
+ },
626
+ {
627
+ "epoch": 0.8914285714285715,
628
+ "eval_accuracy": 0.9504285454750061,
629
+ "eval_loss": 0.1459112912416458,
630
+ "eval_runtime": 256.3941,
631
+ "eval_samples_per_second": 27.302,
632
+ "eval_steps_per_second": 6.825,
633
+ "step": 390
634
+ },
635
+ {
636
+ "epoch": 0.9142857142857143,
637
+ "grad_norm": 6.386229991912842,
638
+ "learning_rate": 8.695652173913044e-06,
639
+ "loss": 0.1372,
640
+ "step": 400
641
+ },
642
+ {
643
+ "epoch": 0.9142857142857143,
644
+ "eval_accuracy": 0.9418571591377258,
645
+ "eval_loss": 0.1748434156179428,
646
+ "eval_runtime": 266.7645,
647
+ "eval_samples_per_second": 26.24,
648
+ "eval_steps_per_second": 6.56,
649
+ "step": 400
650
+ },
651
+ {
652
+ "epoch": 0.9371428571428572,
653
+ "grad_norm": 7.714508056640625,
654
+ "learning_rate": 6.407322654462243e-06,
655
+ "loss": 0.1942,
656
+ "step": 410
657
+ },
658
+ {
659
+ "epoch": 0.9371428571428572,
660
+ "eval_accuracy": 0.9405714273452759,
661
+ "eval_loss": 0.18131674826145172,
662
+ "eval_runtime": 266.6389,
663
+ "eval_samples_per_second": 26.253,
664
+ "eval_steps_per_second": 6.563,
665
+ "step": 410
666
+ },
667
+ {
668
+ "epoch": 0.96,
669
+ "grad_norm": 4.493211269378662,
670
+ "learning_rate": 4.118993135011442e-06,
671
+ "loss": 0.1886,
672
+ "step": 420
673
+ },
674
+ {
675
+ "epoch": 0.96,
676
+ "eval_accuracy": 0.9509999752044678,
677
+ "eval_loss": 0.15357272326946259,
678
+ "eval_runtime": 273.0321,
679
+ "eval_samples_per_second": 25.638,
680
+ "eval_steps_per_second": 6.41,
681
+ "step": 420
682
+ },
683
+ {
684
+ "epoch": 0.9828571428571429,
685
+ "grad_norm": 4.66563606262207,
686
+ "learning_rate": 1.8306636155606409e-06,
687
+ "loss": 0.1872,
688
+ "step": 430
689
+ },
690
+ {
691
+ "epoch": 0.9828571428571429,
692
+ "eval_accuracy": 0.952571451663971,
693
+ "eval_loss": 0.1465713381767273,
694
+ "eval_runtime": 266.7172,
695
+ "eval_samples_per_second": 26.245,
696
+ "eval_steps_per_second": 6.561,
697
+ "step": 430
698
+ },
699
+ {
700
+ "epoch": 0.9988571428571429,
701
+ "step": 437,
702
+ "total_flos": 1.3128537437918904e+18,
703
+ "train_loss": 0.3557066834218442,
704
+ "train_runtime": 12202.3201,
705
+ "train_samples_per_second": 2.295,
706
+ "train_steps_per_second": 0.036
707
+ }
708
+ ],
709
+ "logging_steps": 10,
710
+ "max_steps": 437,
711
+ "num_input_tokens_seen": 0,
712
+ "num_train_epochs": 1,
713
+ "save_steps": 10,
714
+ "stateful_callbacks": {
715
+ "TrainerControl": {
716
+ "args": {
717
+ "should_epoch_stop": false,
718
+ "should_evaluate": false,
719
+ "should_log": false,
720
+ "should_save": true,
721
+ "should_training_stop": true
722
+ },
723
+ "attributes": {}
724
+ }
725
+ },
726
+ "total_flos": 1.3128537437918904e+18,
727
+ "train_batch_size": 32,
728
+ "trial_name": null,
729
+ "trial_params": null
730
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b30299245c31111db5b268f0927632aca7ff3e92f02299a9653ecdfa84cdf28c
3
+ size 5368