IRI2070 committed on
Commit
818f6fe
·
verified ·
1 Parent(s): 983aed5

Upload folder using huggingface_hub

Browse files
README.md CHANGED
@@ -1,3 +1,58 @@
1
- ---
2
- license: apache-2.0
3
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ license: apache-2.0
4
+ base_model: PartAI/TookaBERT-Large
5
+ tags:
6
+ - generated_from_trainer
7
+ metrics:
8
+ - accuracy
9
+ model-index:
10
+ - name: snapfood
11
+ results: []
12
+ ---
13
+
14
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
15
+ should probably proofread and complete it, then remove this comment. -->
16
+
17
+ # snapfood
18
+
19
+ This model is a fine-tuned version of [PartAI/TookaBERT-Large](https://huggingface.co/PartAI/TookaBERT-Large) on an unknown dataset.
20
+ It achieves the following results on the evaluation set:
21
+ - Loss: 0.3681
22
+ - Accuracy: 0.8793
23
+
24
+ ## Model description
25
+
26
+ More information needed
27
+
28
+ ## Intended uses & limitations
29
+
30
+ More information needed
31
+
32
+ ## Training and evaluation data
33
+
34
+ More information needed
35
+
36
+ ## Training procedure
37
+
38
+ ### Training hyperparameters
39
+
40
+ The following hyperparameters were used during training:
41
+ - learning_rate: 2e-05
42
+ - train_batch_size: 10
43
+ - eval_batch_size: 8
44
+ - seed: 42
45
+ - optimizer: AdamW (OptimizerNames.ADAMW_TORCH) with betas=(0.9,0.999) and epsilon=1e-08; no additional optimizer arguments
46
+ - lr_scheduler_type: linear
47
+ - num_epochs: 3.0
48
+
49
+ ### Training results
50
+
51
+
52
+
53
+ ### Framework versions
54
+
55
+ - Transformers 5.0.0.dev0
56
+ - Pytorch 2.6.0+cu124
57
+ - Datasets 4.4.1
58
+ - Tokenizers 0.22.1
all_results.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 3.0,
3
+ "eval_accuracy": 0.8793330934388869,
4
+ "eval_loss": 0.36814266443252563,
5
+ "eval_runtime": 437.0819,
6
+ "eval_samples": 8337,
7
+ "eval_samples_per_second": 19.074,
8
+ "eval_steps_per_second": 2.386,
9
+ "total_flos": 1.4568883002906624e+17,
10
+ "train_loss": 0.3201524627984222,
11
+ "train_runtime": 31218.1999,
12
+ "train_samples": 52110,
13
+ "train_samples_per_second": 5.008,
14
+ "train_steps_per_second": 0.501
15
+ }
checkpoint-10422/config.json ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "BertForSequenceClassification"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "bos_token_id": 2,
7
+ "classifier_dropout": null,
8
+ "dtype": "float32",
9
+ "eos_token_id": 3,
10
+ "finetuning_task": "text-classification",
11
+ "hidden_act": "gelu",
12
+ "hidden_dropout_prob": 0.1,
13
+ "hidden_size": 1024,
14
+ "id2label": {
15
+ "0": "HAPPY",
16
+ "1": "SAD"
17
+ },
18
+ "initializer_range": 0.02,
19
+ "intermediate_size": 4096,
20
+ "label2id": {
21
+ "HAPPY": 0,
22
+ "SAD": 1
23
+ },
24
+ "layer_norm_eps": 1e-12,
25
+ "max_position_embeddings": 512,
26
+ "model_type": "bert",
27
+ "num_attention_heads": 16,
28
+ "num_hidden_layers": 24,
29
+ "pad_token_id": 0,
30
+ "position_embedding_type": "absolute",
31
+ "problem_type": "single_label_classification",
32
+ "transformers_version": "5.0.0.dev0",
33
+ "type_vocab_size": 2,
34
+ "use_cache": false,
35
+ "vocab_size": 48000
36
+ }
checkpoint-10422/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:570152b4f6618f1475c9c8a864a715f5b0d275752e76b4d44476b3f974aefe0f
3
+ size 1412212656
checkpoint-10422/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:67b1ec93309ece0c69b93368042ed1194c59ef938ac7269ee941320dc3978f66
3
+ size 2824660205
checkpoint-10422/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aa24153f31489291c0f9ea47ee9cec86b6296b009004a9e32ada3be05dacda76
3
+ size 14244
checkpoint-10422/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0d9130bb4c145fcf308e8b2729a7d3ebb81c4a003296f7afb90af4f2984dbfc0
3
+ size 1064
checkpoint-10422/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-10422/tokenizer_config.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": null,
3
+ "backend": "tokenizers",
4
+ "bos_token": "<s>",
5
+ "clean_up_tokenization_spaces": true,
6
+ "eos_token": "</s>",
7
+ "is_local": false,
8
+ "mask_token": "<mask>",
9
+ "max_length": null,
10
+ "model_input_names": [
11
+ "input_ids",
12
+ "attention_mask"
13
+ ],
14
+ "model_max_length": 512,
15
+ "pad_to_multiple_of": null,
16
+ "pad_token": "<pad>",
17
+ "pad_token_type_id": 0,
18
+ "padding_side": "right",
19
+ "sep_token": "</s>",
20
+ "tokenizer_class": "TokenizersBackend",
21
+ "truncation_side": "right",
22
+ "unk_token": "<unk>"
23
+ }
checkpoint-10422/trainer_state.json ADDED
@@ -0,0 +1,174 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 2.0,
6
+ "eval_steps": 500,
7
+ "global_step": 10422,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.09595087315294569,
14
+ "grad_norm": 12.811051368713379,
15
+ "learning_rate": 1.936160685728907e-05,
16
+ "loss": 0.435078857421875,
17
+ "step": 500
18
+ },
19
+ {
20
+ "epoch": 0.19190174630589138,
21
+ "grad_norm": 8.909839630126953,
22
+ "learning_rate": 1.8721934369602766e-05,
23
+ "loss": 0.38166162109375,
24
+ "step": 1000
25
+ },
26
+ {
27
+ "epoch": 0.28785261945883706,
28
+ "grad_norm": 23.91183853149414,
29
+ "learning_rate": 1.8082261881916462e-05,
30
+ "loss": 0.3671803283691406,
31
+ "step": 1500
32
+ },
33
+ {
34
+ "epoch": 0.38380349261178276,
35
+ "grad_norm": 13.278630256652832,
36
+ "learning_rate": 1.7442589394230155e-05,
37
+ "loss": 0.34918722534179686,
38
+ "step": 2000
39
+ },
40
+ {
41
+ "epoch": 0.47975436576472846,
42
+ "grad_norm": 6.475880146026611,
43
+ "learning_rate": 1.680291690654385e-05,
44
+ "loss": 0.35130245971679686,
45
+ "step": 2500
46
+ },
47
+ {
48
+ "epoch": 0.5757052389176741,
49
+ "grad_norm": 5.953056812286377,
50
+ "learning_rate": 1.6163244418857547e-05,
51
+ "loss": 0.363922607421875,
52
+ "step": 3000
53
+ },
54
+ {
55
+ "epoch": 0.6716561120706198,
56
+ "grad_norm": 9.432499885559082,
57
+ "learning_rate": 1.5523571931171243e-05,
58
+ "loss": 0.37142852783203123,
59
+ "step": 3500
60
+ },
61
+ {
62
+ "epoch": 0.7676069852235655,
63
+ "grad_norm": 3.409161329269409,
64
+ "learning_rate": 1.4883899443484937e-05,
65
+ "loss": 0.3608262023925781,
66
+ "step": 4000
67
+ },
68
+ {
69
+ "epoch": 0.8635578583765112,
70
+ "grad_norm": 1.59526789188385,
71
+ "learning_rate": 1.4244226955798633e-05,
72
+ "loss": 0.3568770446777344,
73
+ "step": 4500
74
+ },
75
+ {
76
+ "epoch": 0.9595087315294569,
77
+ "grad_norm": 4.420688629150391,
78
+ "learning_rate": 1.3604554468112327e-05,
79
+ "loss": 0.3340596923828125,
80
+ "step": 5000
81
+ },
82
+ {
83
+ "epoch": 1.0554596046824025,
84
+ "grad_norm": 8.594019889831543,
85
+ "learning_rate": 1.2964881980426023e-05,
86
+ "loss": 0.3522498779296875,
87
+ "step": 5500
88
+ },
89
+ {
90
+ "epoch": 1.1514104778353482,
91
+ "grad_norm": 3.242337226867676,
92
+ "learning_rate": 1.2325209492739718e-05,
93
+ "loss": 0.32486505126953125,
94
+ "step": 6000
95
+ },
96
+ {
97
+ "epoch": 1.247361350988294,
98
+ "grad_norm": 20.167972564697266,
99
+ "learning_rate": 1.1685537005053414e-05,
100
+ "loss": 0.3254352416992187,
101
+ "step": 6500
102
+ },
103
+ {
104
+ "epoch": 1.3433122241412396,
105
+ "grad_norm": 4.983191967010498,
106
+ "learning_rate": 1.1045864517367108e-05,
107
+ "loss": 0.330927490234375,
108
+ "step": 7000
109
+ },
110
+ {
111
+ "epoch": 1.4392630972941853,
112
+ "grad_norm": 1.6593592166900635,
113
+ "learning_rate": 1.0406192029680804e-05,
114
+ "loss": 0.3213047485351562,
115
+ "step": 7500
116
+ },
117
+ {
118
+ "epoch": 1.535213970447131,
119
+ "grad_norm": 17.288793563842773,
120
+ "learning_rate": 9.7665195419945e-06,
121
+ "loss": 0.3227524719238281,
122
+ "step": 8000
123
+ },
124
+ {
125
+ "epoch": 1.6311648436000767,
126
+ "grad_norm": 3.9145667552948,
127
+ "learning_rate": 9.126847054308194e-06,
128
+ "loss": 0.3189747924804687,
129
+ "step": 8500
130
+ },
131
+ {
132
+ "epoch": 1.7271157167530224,
133
+ "grad_norm": 3.7356808185577393,
134
+ "learning_rate": 8.48717456662189e-06,
135
+ "loss": 0.33311187744140625,
136
+ "step": 9000
137
+ },
138
+ {
139
+ "epoch": 1.8230665899059681,
140
+ "grad_norm": 15.458077430725098,
141
+ "learning_rate": 7.847502078935586e-06,
142
+ "loss": 0.3166582946777344,
143
+ "step": 9500
144
+ },
145
+ {
146
+ "epoch": 1.9190174630589139,
147
+ "grad_norm": 0.5702362060546875,
148
+ "learning_rate": 7.207829591249281e-06,
149
+ "loss": 0.3185113525390625,
150
+ "step": 10000
151
+ }
152
+ ],
153
+ "logging_steps": 500,
154
+ "max_steps": 15633,
155
+ "num_input_tokens_seen": 0,
156
+ "num_train_epochs": 3,
157
+ "save_steps": 500,
158
+ "stateful_callbacks": {
159
+ "TrainerControl": {
160
+ "args": {
161
+ "should_epoch_stop": false,
162
+ "should_evaluate": false,
163
+ "should_log": false,
164
+ "should_save": true,
165
+ "should_training_stop": false
166
+ },
167
+ "attributes": {}
168
+ }
169
+ },
170
+ "total_flos": 9.712588668604416e+16,
171
+ "train_batch_size": 10,
172
+ "trial_name": null,
173
+ "trial_params": null
174
+ }
checkpoint-10422/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7010d8e51577a38b5b304dc130057e792156cfd67546e35490c409c520b99664
3
+ size 4728
checkpoint-15633/config.json ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "BertForSequenceClassification"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "bos_token_id": 2,
7
+ "classifier_dropout": null,
8
+ "dtype": "float32",
9
+ "eos_token_id": 3,
10
+ "finetuning_task": "text-classification",
11
+ "hidden_act": "gelu",
12
+ "hidden_dropout_prob": 0.1,
13
+ "hidden_size": 1024,
14
+ "id2label": {
15
+ "0": "HAPPY",
16
+ "1": "SAD"
17
+ },
18
+ "initializer_range": 0.02,
19
+ "intermediate_size": 4096,
20
+ "label2id": {
21
+ "HAPPY": 0,
22
+ "SAD": 1
23
+ },
24
+ "layer_norm_eps": 1e-12,
25
+ "max_position_embeddings": 512,
26
+ "model_type": "bert",
27
+ "num_attention_heads": 16,
28
+ "num_hidden_layers": 24,
29
+ "pad_token_id": 0,
30
+ "position_embedding_type": "absolute",
31
+ "problem_type": "single_label_classification",
32
+ "transformers_version": "5.0.0.dev0",
33
+ "type_vocab_size": 2,
34
+ "use_cache": false,
35
+ "vocab_size": 48000
36
+ }
checkpoint-15633/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:45d967b4d0ab75d579d47f05cac9a4298b0a73260e0fee3415bd9839b5b82998
3
+ size 1412212656
checkpoint-15633/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bc6942464893a7c9e32df66009a1d630143ef6d2017fbd42dcda8f6567bc4361
3
+ size 2824660205
checkpoint-15633/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fa3e5f9dfd67c2c16e2ff5aea81baa45050a9ae7193b31484cbfebb216b582e5
3
+ size 14244
checkpoint-15633/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1a2030a19651e57c5517d754d6ce02fd147fd15f8b0cf9227e5cf85c5e394836
3
+ size 1064
checkpoint-15633/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-15633/tokenizer_config.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": null,
3
+ "backend": "tokenizers",
4
+ "bos_token": "<s>",
5
+ "clean_up_tokenization_spaces": true,
6
+ "eos_token": "</s>",
7
+ "is_local": false,
8
+ "mask_token": "<mask>",
9
+ "max_length": null,
10
+ "model_input_names": [
11
+ "input_ids",
12
+ "attention_mask"
13
+ ],
14
+ "model_max_length": 512,
15
+ "pad_to_multiple_of": null,
16
+ "pad_token": "<pad>",
17
+ "pad_token_type_id": 0,
18
+ "padding_side": "right",
19
+ "sep_token": "</s>",
20
+ "tokenizer_class": "TokenizersBackend",
21
+ "truncation_side": "right",
22
+ "unk_token": "<unk>"
23
+ }
checkpoint-15633/trainer_state.json ADDED
@@ -0,0 +1,251 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 3.0,
6
+ "eval_steps": 500,
7
+ "global_step": 15633,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.09595087315294569,
14
+ "grad_norm": 12.811051368713379,
15
+ "learning_rate": 1.936160685728907e-05,
16
+ "loss": 0.435078857421875,
17
+ "step": 500
18
+ },
19
+ {
20
+ "epoch": 0.19190174630589138,
21
+ "grad_norm": 8.909839630126953,
22
+ "learning_rate": 1.8721934369602766e-05,
23
+ "loss": 0.38166162109375,
24
+ "step": 1000
25
+ },
26
+ {
27
+ "epoch": 0.28785261945883706,
28
+ "grad_norm": 23.91183853149414,
29
+ "learning_rate": 1.8082261881916462e-05,
30
+ "loss": 0.3671803283691406,
31
+ "step": 1500
32
+ },
33
+ {
34
+ "epoch": 0.38380349261178276,
35
+ "grad_norm": 13.278630256652832,
36
+ "learning_rate": 1.7442589394230155e-05,
37
+ "loss": 0.34918722534179686,
38
+ "step": 2000
39
+ },
40
+ {
41
+ "epoch": 0.47975436576472846,
42
+ "grad_norm": 6.475880146026611,
43
+ "learning_rate": 1.680291690654385e-05,
44
+ "loss": 0.35130245971679686,
45
+ "step": 2500
46
+ },
47
+ {
48
+ "epoch": 0.5757052389176741,
49
+ "grad_norm": 5.953056812286377,
50
+ "learning_rate": 1.6163244418857547e-05,
51
+ "loss": 0.363922607421875,
52
+ "step": 3000
53
+ },
54
+ {
55
+ "epoch": 0.6716561120706198,
56
+ "grad_norm": 9.432499885559082,
57
+ "learning_rate": 1.5523571931171243e-05,
58
+ "loss": 0.37142852783203123,
59
+ "step": 3500
60
+ },
61
+ {
62
+ "epoch": 0.7676069852235655,
63
+ "grad_norm": 3.409161329269409,
64
+ "learning_rate": 1.4883899443484937e-05,
65
+ "loss": 0.3608262023925781,
66
+ "step": 4000
67
+ },
68
+ {
69
+ "epoch": 0.8635578583765112,
70
+ "grad_norm": 1.59526789188385,
71
+ "learning_rate": 1.4244226955798633e-05,
72
+ "loss": 0.3568770446777344,
73
+ "step": 4500
74
+ },
75
+ {
76
+ "epoch": 0.9595087315294569,
77
+ "grad_norm": 4.420688629150391,
78
+ "learning_rate": 1.3604554468112327e-05,
79
+ "loss": 0.3340596923828125,
80
+ "step": 5000
81
+ },
82
+ {
83
+ "epoch": 1.0554596046824025,
84
+ "grad_norm": 8.594019889831543,
85
+ "learning_rate": 1.2964881980426023e-05,
86
+ "loss": 0.3522498779296875,
87
+ "step": 5500
88
+ },
89
+ {
90
+ "epoch": 1.1514104778353482,
91
+ "grad_norm": 3.242337226867676,
92
+ "learning_rate": 1.2325209492739718e-05,
93
+ "loss": 0.32486505126953125,
94
+ "step": 6000
95
+ },
96
+ {
97
+ "epoch": 1.247361350988294,
98
+ "grad_norm": 20.167972564697266,
99
+ "learning_rate": 1.1685537005053414e-05,
100
+ "loss": 0.3254352416992187,
101
+ "step": 6500
102
+ },
103
+ {
104
+ "epoch": 1.3433122241412396,
105
+ "grad_norm": 4.983191967010498,
106
+ "learning_rate": 1.1045864517367108e-05,
107
+ "loss": 0.330927490234375,
108
+ "step": 7000
109
+ },
110
+ {
111
+ "epoch": 1.4392630972941853,
112
+ "grad_norm": 1.6593592166900635,
113
+ "learning_rate": 1.0406192029680804e-05,
114
+ "loss": 0.3213047485351562,
115
+ "step": 7500
116
+ },
117
+ {
118
+ "epoch": 1.535213970447131,
119
+ "grad_norm": 17.288793563842773,
120
+ "learning_rate": 9.7665195419945e-06,
121
+ "loss": 0.3227524719238281,
122
+ "step": 8000
123
+ },
124
+ {
125
+ "epoch": 1.6311648436000767,
126
+ "grad_norm": 3.9145667552948,
127
+ "learning_rate": 9.126847054308194e-06,
128
+ "loss": 0.3189747924804687,
129
+ "step": 8500
130
+ },
131
+ {
132
+ "epoch": 1.7271157167530224,
133
+ "grad_norm": 3.7356808185577393,
134
+ "learning_rate": 8.48717456662189e-06,
135
+ "loss": 0.33311187744140625,
136
+ "step": 9000
137
+ },
138
+ {
139
+ "epoch": 1.8230665899059681,
140
+ "grad_norm": 15.458077430725098,
141
+ "learning_rate": 7.847502078935586e-06,
142
+ "loss": 0.3166582946777344,
143
+ "step": 9500
144
+ },
145
+ {
146
+ "epoch": 1.9190174630589139,
147
+ "grad_norm": 0.5702362060546875,
148
+ "learning_rate": 7.207829591249281e-06,
149
+ "loss": 0.3185113525390625,
150
+ "step": 10000
151
+ },
152
+ {
153
+ "epoch": 2.0149683362118593,
154
+ "grad_norm": 20.679962158203125,
155
+ "learning_rate": 6.568157103562976e-06,
156
+ "loss": 0.29407708740234373,
157
+ "step": 10500
158
+ },
159
+ {
160
+ "epoch": 2.110919209364805,
161
+ "grad_norm": 1.0656015872955322,
162
+ "learning_rate": 5.928484615876672e-06,
163
+ "loss": 0.27719247436523436,
164
+ "step": 11000
165
+ },
166
+ {
167
+ "epoch": 2.2068700825177507,
168
+ "grad_norm": 2.5045535564422607,
169
+ "learning_rate": 5.288812128190367e-06,
170
+ "loss": 0.2760169067382813,
171
+ "step": 11500
172
+ },
173
+ {
174
+ "epoch": 2.3028209556706964,
175
+ "grad_norm": 0.9866521954536438,
176
+ "learning_rate": 4.649139640504062e-06,
177
+ "loss": 0.25182223510742185,
178
+ "step": 12000
179
+ },
180
+ {
181
+ "epoch": 2.398771828823642,
182
+ "grad_norm": 0.861988365650177,
183
+ "learning_rate": 4.0094671528177575e-06,
184
+ "loss": 0.27130630493164065,
185
+ "step": 12500
186
+ },
187
+ {
188
+ "epoch": 2.494722701976588,
189
+ "grad_norm": 3.025139808654785,
190
+ "learning_rate": 3.3697946651314527e-06,
191
+ "loss": 0.2797826232910156,
192
+ "step": 13000
193
+ },
194
+ {
195
+ "epoch": 2.5906735751295336,
196
+ "grad_norm": 5.455494403839111,
197
+ "learning_rate": 2.7301221774451482e-06,
198
+ "loss": 0.274226806640625,
199
+ "step": 13500
200
+ },
201
+ {
202
+ "epoch": 2.6866244482824793,
203
+ "grad_norm": 10.867804527282715,
204
+ "learning_rate": 2.0904496897588438e-06,
205
+ "loss": 0.2681912841796875,
206
+ "step": 14000
207
+ },
208
+ {
209
+ "epoch": 2.782575321435425,
210
+ "grad_norm": 2.9777698516845703,
211
+ "learning_rate": 1.4507772020725389e-06,
212
+ "loss": 0.2753703002929688,
213
+ "step": 14500
214
+ },
215
+ {
216
+ "epoch": 2.8785261945883707,
217
+ "grad_norm": 14.00378704071045,
218
+ "learning_rate": 8.111047143862342e-07,
219
+ "loss": 0.2732333984375,
220
+ "step": 15000
221
+ },
222
+ {
223
+ "epoch": 2.9744770677413164,
224
+ "grad_norm": 4.573943614959717,
225
+ "learning_rate": 1.7143222669992966e-07,
226
+ "loss": 0.26397906494140627,
227
+ "step": 15500
228
+ }
229
+ ],
230
+ "logging_steps": 500,
231
+ "max_steps": 15633,
232
+ "num_input_tokens_seen": 0,
233
+ "num_train_epochs": 3,
234
+ "save_steps": 500,
235
+ "stateful_callbacks": {
236
+ "TrainerControl": {
237
+ "args": {
238
+ "should_epoch_stop": false,
239
+ "should_evaluate": false,
240
+ "should_log": false,
241
+ "should_save": true,
242
+ "should_training_stop": true
243
+ },
244
+ "attributes": {}
245
+ }
246
+ },
247
+ "total_flos": 1.4568883002906624e+17,
248
+ "train_batch_size": 10,
249
+ "trial_name": null,
250
+ "trial_params": null
251
+ }
checkpoint-15633/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7010d8e51577a38b5b304dc130057e792156cfd67546e35490c409c520b99664
3
+ size 4728
checkpoint-5211/config.json ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "BertForSequenceClassification"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "bos_token_id": 2,
7
+ "classifier_dropout": null,
8
+ "dtype": "float32",
9
+ "eos_token_id": 3,
10
+ "finetuning_task": "text-classification",
11
+ "hidden_act": "gelu",
12
+ "hidden_dropout_prob": 0.1,
13
+ "hidden_size": 1024,
14
+ "id2label": {
15
+ "0": "HAPPY",
16
+ "1": "SAD"
17
+ },
18
+ "initializer_range": 0.02,
19
+ "intermediate_size": 4096,
20
+ "label2id": {
21
+ "HAPPY": 0,
22
+ "SAD": 1
23
+ },
24
+ "layer_norm_eps": 1e-12,
25
+ "max_position_embeddings": 512,
26
+ "model_type": "bert",
27
+ "num_attention_heads": 16,
28
+ "num_hidden_layers": 24,
29
+ "pad_token_id": 0,
30
+ "position_embedding_type": "absolute",
31
+ "problem_type": "single_label_classification",
32
+ "transformers_version": "5.0.0.dev0",
33
+ "type_vocab_size": 2,
34
+ "use_cache": false,
35
+ "vocab_size": 48000
36
+ }
checkpoint-5211/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:54a3674be785ff96c3dc6a9c15c440a73466da3d4a866e70f3d82230d94a416e
3
+ size 1412212656
checkpoint-5211/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bbf780085c2962571d3082a65276a5f305a79509e570593b2f510964c579c9c3
3
+ size 2824660205
checkpoint-5211/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:16e18830d2e1b32b5415909b9f213b785b54d311c702e4a81e63c36ee0999346
3
+ size 14244
checkpoint-5211/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9d467fbe9fcf92f32237bcdf9a6a80ac02d3f9dd0dbc5dc7c9bb09695101e739
3
+ size 1064
checkpoint-5211/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-5211/tokenizer_config.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": null,
3
+ "backend": "tokenizers",
4
+ "bos_token": "<s>",
5
+ "clean_up_tokenization_spaces": true,
6
+ "eos_token": "</s>",
7
+ "is_local": false,
8
+ "mask_token": "<mask>",
9
+ "max_length": null,
10
+ "model_input_names": [
11
+ "input_ids",
12
+ "attention_mask"
13
+ ],
14
+ "model_max_length": 512,
15
+ "pad_to_multiple_of": null,
16
+ "pad_token": "<pad>",
17
+ "pad_token_type_id": 0,
18
+ "padding_side": "right",
19
+ "sep_token": "</s>",
20
+ "tokenizer_class": "TokenizersBackend",
21
+ "truncation_side": "right",
22
+ "unk_token": "<unk>"
23
+ }
checkpoint-5211/trainer_state.json ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 1.0,
6
+ "eval_steps": 500,
7
+ "global_step": 5211,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.09595087315294569,
14
+ "grad_norm": 12.811051368713379,
15
+ "learning_rate": 1.936160685728907e-05,
16
+ "loss": 0.435078857421875,
17
+ "step": 500
18
+ },
19
+ {
20
+ "epoch": 0.19190174630589138,
21
+ "grad_norm": 8.909839630126953,
22
+ "learning_rate": 1.8721934369602766e-05,
23
+ "loss": 0.38166162109375,
24
+ "step": 1000
25
+ },
26
+ {
27
+ "epoch": 0.28785261945883706,
28
+ "grad_norm": 23.91183853149414,
29
+ "learning_rate": 1.8082261881916462e-05,
30
+ "loss": 0.3671803283691406,
31
+ "step": 1500
32
+ },
33
+ {
34
+ "epoch": 0.38380349261178276,
35
+ "grad_norm": 13.278630256652832,
36
+ "learning_rate": 1.7442589394230155e-05,
37
+ "loss": 0.34918722534179686,
38
+ "step": 2000
39
+ },
40
+ {
41
+ "epoch": 0.47975436576472846,
42
+ "grad_norm": 6.475880146026611,
43
+ "learning_rate": 1.680291690654385e-05,
44
+ "loss": 0.35130245971679686,
45
+ "step": 2500
46
+ },
47
+ {
48
+ "epoch": 0.5757052389176741,
49
+ "grad_norm": 5.953056812286377,
50
+ "learning_rate": 1.6163244418857547e-05,
51
+ "loss": 0.363922607421875,
52
+ "step": 3000
53
+ },
54
+ {
55
+ "epoch": 0.6716561120706198,
56
+ "grad_norm": 9.432499885559082,
57
+ "learning_rate": 1.5523571931171243e-05,
58
+ "loss": 0.37142852783203123,
59
+ "step": 3500
60
+ },
61
+ {
62
+ "epoch": 0.7676069852235655,
63
+ "grad_norm": 3.409161329269409,
64
+ "learning_rate": 1.4883899443484937e-05,
65
+ "loss": 0.3608262023925781,
66
+ "step": 4000
67
+ },
68
+ {
69
+ "epoch": 0.8635578583765112,
70
+ "grad_norm": 1.59526789188385,
71
+ "learning_rate": 1.4244226955798633e-05,
72
+ "loss": 0.3568770446777344,
73
+ "step": 4500
74
+ },
75
+ {
76
+ "epoch": 0.9595087315294569,
77
+ "grad_norm": 4.420688629150391,
78
+ "learning_rate": 1.3604554468112327e-05,
79
+ "loss": 0.3340596923828125,
80
+ "step": 5000
81
+ }
82
+ ],
83
+ "logging_steps": 500,
84
+ "max_steps": 15633,
85
+ "num_input_tokens_seen": 0,
86
+ "num_train_epochs": 3,
87
+ "save_steps": 500,
88
+ "stateful_callbacks": {
89
+ "TrainerControl": {
90
+ "args": {
91
+ "should_epoch_stop": false,
92
+ "should_evaluate": false,
93
+ "should_log": false,
94
+ "should_save": true,
95
+ "should_training_stop": false
96
+ },
97
+ "attributes": {}
98
+ }
99
+ },
100
+ "total_flos": 4.856294334302208e+16,
101
+ "train_batch_size": 10,
102
+ "trial_name": null,
103
+ "trial_params": null
104
+ }
checkpoint-5211/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7010d8e51577a38b5b304dc130057e792156cfd67546e35490c409c520b99664
3
+ size 4728
config.json ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "BertForSequenceClassification"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "bos_token_id": 2,
7
+ "classifier_dropout": null,
8
+ "dtype": "float32",
9
+ "eos_token_id": 3,
10
+ "finetuning_task": "text-classification",
11
+ "hidden_act": "gelu",
12
+ "hidden_dropout_prob": 0.1,
13
+ "hidden_size": 1024,
14
+ "id2label": {
15
+ "0": "HAPPY",
16
+ "1": "SAD"
17
+ },
18
+ "initializer_range": 0.02,
19
+ "intermediate_size": 4096,
20
+ "label2id": {
21
+ "HAPPY": 0,
22
+ "SAD": 1
23
+ },
24
+ "layer_norm_eps": 1e-12,
25
+ "max_position_embeddings": 512,
26
+ "model_type": "bert",
27
+ "num_attention_heads": 16,
28
+ "num_hidden_layers": 24,
29
+ "pad_token_id": 0,
30
+ "position_embedding_type": "absolute",
31
+ "problem_type": "single_label_classification",
32
+ "transformers_version": "5.0.0.dev0",
33
+ "type_vocab_size": 2,
34
+ "use_cache": false,
35
+ "vocab_size": 48000
36
+ }
eval_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 3.0,
3
+ "eval_accuracy": 0.8793330934388869,
4
+ "eval_loss": 0.36814266443252563,
5
+ "eval_runtime": 437.0819,
6
+ "eval_samples": 8337,
7
+ "eval_samples_per_second": 19.074,
8
+ "eval_steps_per_second": 2.386
9
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:45d967b4d0ab75d579d47f05cac9a4298b0a73260e0fee3415bd9839b5b82998
3
+ size 1412212656
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": null,
3
+ "backend": "tokenizers",
4
+ "bos_token": "<s>",
5
+ "clean_up_tokenization_spaces": true,
6
+ "eos_token": "</s>",
7
+ "is_local": false,
8
+ "mask_token": "<mask>",
9
+ "max_length": null,
10
+ "model_input_names": [
11
+ "input_ids",
12
+ "attention_mask"
13
+ ],
14
+ "model_max_length": 512,
15
+ "pad_to_multiple_of": null,
16
+ "pad_token": "<pad>",
17
+ "pad_token_type_id": 0,
18
+ "padding_side": "right",
19
+ "sep_token": "</s>",
20
+ "tokenizer_class": "TokenizersBackend",
21
+ "truncation_side": "right",
22
+ "unk_token": "<unk>"
23
+ }
train_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 3.0,
3
+ "total_flos": 1.4568883002906624e+17,
4
+ "train_loss": 0.3201524627984222,
5
+ "train_runtime": 31218.1999,
6
+ "train_samples": 52110,
7
+ "train_samples_per_second": 5.008,
8
+ "train_steps_per_second": 0.501
9
+ }
trainer_state.json ADDED
@@ -0,0 +1,260 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 3.0,
6
+ "eval_steps": 500,
7
+ "global_step": 15633,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.09595087315294569,
14
+ "grad_norm": 12.811051368713379,
15
+ "learning_rate": 1.936160685728907e-05,
16
+ "loss": 0.435078857421875,
17
+ "step": 500
18
+ },
19
+ {
20
+ "epoch": 0.19190174630589138,
21
+ "grad_norm": 8.909839630126953,
22
+ "learning_rate": 1.8721934369602766e-05,
23
+ "loss": 0.38166162109375,
24
+ "step": 1000
25
+ },
26
+ {
27
+ "epoch": 0.28785261945883706,
28
+ "grad_norm": 23.91183853149414,
29
+ "learning_rate": 1.8082261881916462e-05,
30
+ "loss": 0.3671803283691406,
31
+ "step": 1500
32
+ },
33
+ {
34
+ "epoch": 0.38380349261178276,
35
+ "grad_norm": 13.278630256652832,
36
+ "learning_rate": 1.7442589394230155e-05,
37
+ "loss": 0.34918722534179686,
38
+ "step": 2000
39
+ },
40
+ {
41
+ "epoch": 0.47975436576472846,
42
+ "grad_norm": 6.475880146026611,
43
+ "learning_rate": 1.680291690654385e-05,
44
+ "loss": 0.35130245971679686,
45
+ "step": 2500
46
+ },
47
+ {
48
+ "epoch": 0.5757052389176741,
49
+ "grad_norm": 5.953056812286377,
50
+ "learning_rate": 1.6163244418857547e-05,
51
+ "loss": 0.363922607421875,
52
+ "step": 3000
53
+ },
54
+ {
55
+ "epoch": 0.6716561120706198,
56
+ "grad_norm": 9.432499885559082,
57
+ "learning_rate": 1.5523571931171243e-05,
58
+ "loss": 0.37142852783203123,
59
+ "step": 3500
60
+ },
61
+ {
62
+ "epoch": 0.7676069852235655,
63
+ "grad_norm": 3.409161329269409,
64
+ "learning_rate": 1.4883899443484937e-05,
65
+ "loss": 0.3608262023925781,
66
+ "step": 4000
67
+ },
68
+ {
69
+ "epoch": 0.8635578583765112,
70
+ "grad_norm": 1.59526789188385,
71
+ "learning_rate": 1.4244226955798633e-05,
72
+ "loss": 0.3568770446777344,
73
+ "step": 4500
74
+ },
75
+ {
76
+ "epoch": 0.9595087315294569,
77
+ "grad_norm": 4.420688629150391,
78
+ "learning_rate": 1.3604554468112327e-05,
79
+ "loss": 0.3340596923828125,
80
+ "step": 5000
81
+ },
82
+ {
83
+ "epoch": 1.0554596046824025,
84
+ "grad_norm": 8.594019889831543,
85
+ "learning_rate": 1.2964881980426023e-05,
86
+ "loss": 0.3522498779296875,
87
+ "step": 5500
88
+ },
89
+ {
90
+ "epoch": 1.1514104778353482,
91
+ "grad_norm": 3.242337226867676,
92
+ "learning_rate": 1.2325209492739718e-05,
93
+ "loss": 0.32486505126953125,
94
+ "step": 6000
95
+ },
96
+ {
97
+ "epoch": 1.247361350988294,
98
+ "grad_norm": 20.167972564697266,
99
+ "learning_rate": 1.1685537005053414e-05,
100
+ "loss": 0.3254352416992187,
101
+ "step": 6500
102
+ },
103
+ {
104
+ "epoch": 1.3433122241412396,
105
+ "grad_norm": 4.983191967010498,
106
+ "learning_rate": 1.1045864517367108e-05,
107
+ "loss": 0.330927490234375,
108
+ "step": 7000
109
+ },
110
+ {
111
+ "epoch": 1.4392630972941853,
112
+ "grad_norm": 1.6593592166900635,
113
+ "learning_rate": 1.0406192029680804e-05,
114
+ "loss": 0.3213047485351562,
115
+ "step": 7500
116
+ },
117
+ {
118
+ "epoch": 1.535213970447131,
119
+ "grad_norm": 17.288793563842773,
120
+ "learning_rate": 9.7665195419945e-06,
121
+ "loss": 0.3227524719238281,
122
+ "step": 8000
123
+ },
124
+ {
125
+ "epoch": 1.6311648436000767,
126
+ "grad_norm": 3.9145667552948,
127
+ "learning_rate": 9.126847054308194e-06,
128
+ "loss": 0.3189747924804687,
129
+ "step": 8500
130
+ },
131
+ {
132
+ "epoch": 1.7271157167530224,
133
+ "grad_norm": 3.7356808185577393,
134
+ "learning_rate": 8.48717456662189e-06,
135
+ "loss": 0.33311187744140625,
136
+ "step": 9000
137
+ },
138
+ {
139
+ "epoch": 1.8230665899059681,
140
+ "grad_norm": 15.458077430725098,
141
+ "learning_rate": 7.847502078935586e-06,
142
+ "loss": 0.3166582946777344,
143
+ "step": 9500
144
+ },
145
+ {
146
+ "epoch": 1.9190174630589139,
147
+ "grad_norm": 0.5702362060546875,
148
+ "learning_rate": 7.207829591249281e-06,
149
+ "loss": 0.3185113525390625,
150
+ "step": 10000
151
+ },
152
+ {
153
+ "epoch": 2.0149683362118593,
154
+ "grad_norm": 20.679962158203125,
155
+ "learning_rate": 6.568157103562976e-06,
156
+ "loss": 0.29407708740234373,
157
+ "step": 10500
158
+ },
159
+ {
160
+ "epoch": 2.110919209364805,
161
+ "grad_norm": 1.0656015872955322,
162
+ "learning_rate": 5.928484615876672e-06,
163
+ "loss": 0.27719247436523436,
164
+ "step": 11000
165
+ },
166
+ {
167
+ "epoch": 2.2068700825177507,
168
+ "grad_norm": 2.5045535564422607,
169
+ "learning_rate": 5.288812128190367e-06,
170
+ "loss": 0.2760169067382813,
171
+ "step": 11500
172
+ },
173
+ {
174
+ "epoch": 2.3028209556706964,
175
+ "grad_norm": 0.9866521954536438,
176
+ "learning_rate": 4.649139640504062e-06,
177
+ "loss": 0.25182223510742185,
178
+ "step": 12000
179
+ },
180
+ {
181
+ "epoch": 2.398771828823642,
182
+ "grad_norm": 0.861988365650177,
183
+ "learning_rate": 4.0094671528177575e-06,
184
+ "loss": 0.27130630493164065,
185
+ "step": 12500
186
+ },
187
+ {
188
+ "epoch": 2.494722701976588,
189
+ "grad_norm": 3.025139808654785,
190
+ "learning_rate": 3.3697946651314527e-06,
191
+ "loss": 0.2797826232910156,
192
+ "step": 13000
193
+ },
194
+ {
195
+ "epoch": 2.5906735751295336,
196
+ "grad_norm": 5.455494403839111,
197
+ "learning_rate": 2.7301221774451482e-06,
198
+ "loss": 0.274226806640625,
199
+ "step": 13500
200
+ },
201
+ {
202
+ "epoch": 2.6866244482824793,
203
+ "grad_norm": 10.867804527282715,
204
+ "learning_rate": 2.0904496897588438e-06,
205
+ "loss": 0.2681912841796875,
206
+ "step": 14000
207
+ },
208
+ {
209
+ "epoch": 2.782575321435425,
210
+ "grad_norm": 2.9777698516845703,
211
+ "learning_rate": 1.4507772020725389e-06,
212
+ "loss": 0.2753703002929688,
213
+ "step": 14500
214
+ },
215
+ {
216
+ "epoch": 2.8785261945883707,
217
+ "grad_norm": 14.00378704071045,
218
+ "learning_rate": 8.111047143862342e-07,
219
+ "loss": 0.2732333984375,
220
+ "step": 15000
221
+ },
222
+ {
223
+ "epoch": 2.9744770677413164,
224
+ "grad_norm": 4.573943614959717,
225
+ "learning_rate": 1.7143222669992966e-07,
226
+ "loss": 0.26397906494140627,
227
+ "step": 15500
228
+ },
229
+ {
230
+ "epoch": 3.0,
231
+ "step": 15633,
232
+ "total_flos": 1.4568883002906624e+17,
233
+ "train_loss": 0.3201524627984222,
234
+ "train_runtime": 31218.1999,
235
+ "train_samples_per_second": 5.008,
236
+ "train_steps_per_second": 0.501
237
+ }
238
+ ],
239
+ "logging_steps": 500,
240
+ "max_steps": 15633,
241
+ "num_input_tokens_seen": 0,
242
+ "num_train_epochs": 3,
243
+ "save_steps": 500,
244
+ "stateful_callbacks": {
245
+ "TrainerControl": {
246
+ "args": {
247
+ "should_epoch_stop": false,
248
+ "should_evaluate": false,
249
+ "should_log": false,
250
+ "should_save": true,
251
+ "should_training_stop": true
252
+ },
253
+ "attributes": {}
254
+ }
255
+ },
256
+ "total_flos": 1.4568883002906624e+17,
257
+ "train_batch_size": 10,
258
+ "trial_name": null,
259
+ "trial_params": null
260
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7010d8e51577a38b5b304dc130057e792156cfd67546e35490c409c520b99664
3
+ size 4728