crown commited on
Commit
43e8a43
·
verified ·
1 Parent(s): b50dc74

Upload 11 files

Browse files
config.json ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "../../bert",
3
+ "architectures": [
4
+ "BertForSequenceClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "classifier_dropout": null,
8
+ "gradient_checkpointing": false,
9
+ "hidden_act": "gelu",
10
+ "hidden_dropout_prob": 0.1,
11
+ "hidden_size": 768,
12
+ "id2label": {
13
+ "0": "Agricultural and Biological Sciences",
14
+ "1": "Biochemistry, Genetics and Molecular Biology",
15
+ "2": "Chemical Engineering",
16
+ "3": "Chemistry",
17
+ "4": "Computer Science",
18
+ "5": "Earth and Planetary",
19
+ "6": "Economics, Econometrics and Finance",
20
+ "7": "Engineering",
21
+ "8": "Food Science",
22
+ "9": "Immunology and Microbiology",
23
+ "10": "Materials Science",
24
+ "11": "Mathematics",
25
+ "12": "Medicine and Dentistry",
26
+ "13": "Neuroscience",
27
+ "14": "Nursing and Health Professions",
28
+ "15": "Pharmacology, Toxicology and Pharmaceutical Science",
29
+ "16": "Physics and Astronomy",
30
+ "17": "Psychology",
31
+ "18": "Social Sciences",
32
+ "19": "Veterinary Science and Veterinary Medicine"
33
+ },
34
+ "initializer_range": 0.02,
35
+ "intermediate_size": 3072,
36
+ "label2id": {
37
+ "Agricultural and Biological Sciences": 0,
38
+ "Biochemistry, Genetics and Molecular Biology": 1,
39
+ "Chemical Engineering": 2,
40
+ "Chemistry": 3,
41
+ "Computer Science": 4,
42
+ "Earth and Planetary": 5,
43
+ "Economics, Econometrics and Finance": 6,
44
+ "Engineering": 7,
45
+ "Food Science": 8,
46
+ "Immunology and Microbiology": 9,
47
+ "Materials Science": 10,
48
+ "Mathematics": 11,
49
+ "Medicine and Dentistry": 12,
50
+ "Neuroscience": 13,
51
+ "Nursing and Health Professions": 14,
52
+ "Pharmacology, Toxicology and Pharmaceutical Science": 15,
53
+ "Physics and Astronomy": 16,
54
+ "Psychology": 17,
55
+ "Social Sciences": 18,
56
+ "Veterinary Science and Veterinary Medicine": 19
57
+ },
58
+ "layer_norm_eps": 1e-12,
59
+ "max_position_embeddings": 512,
60
+ "model_type": "bert",
61
+ "num_attention_heads": 12,
62
+ "num_hidden_layers": 12,
63
+ "pad_token_id": 0,
64
+ "position_embedding_type": "absolute",
65
+ "problem_type": "multi_label_classification",
66
+ "torch_dtype": "float32",
67
+ "transformers_version": "4.42.3",
68
+ "type_vocab_size": 2,
69
+ "use_cache": true,
70
+ "vocab_size": 30522
71
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a4e5f7e9bad60fe639dedd44471ae1cf5d97f22ebad783fe753dbc72ed593132
3
+ size 438014016
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8bbb1ce5faf8f2cb9776c5566d987764c02e434caf69a6cbf2fa200178735f92
3
+ size 876149114
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:72351c5ca54abf9f46ad9d15a8a913849131486ed6f9ad78ca81a1c8aac71b82
3
+ size 14244
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e27a7631631d7147f4c96c542ba700cdde3381d1d59738c37fc7222b83ff3ab9
3
+ size 1064
special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "100": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "101": {
20
+ "content": "[CLS]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "102": {
28
+ "content": "[SEP]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "103": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "clean_up_tokenization_spaces": true,
45
+ "cls_token": "[CLS]",
46
+ "do_lower_case": true,
47
+ "mask_token": "[MASK]",
48
+ "model_max_length": 512,
49
+ "pad_token": "[PAD]",
50
+ "sep_token": "[SEP]",
51
+ "strip_accents": null,
52
+ "tokenize_chinese_chars": true,
53
+ "tokenizer_class": "BertTokenizer",
54
+ "unk_token": "[UNK]"
55
+ }
trainer_state.json ADDED
@@ -0,0 +1,1586 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.759579488098438,
3
+ "best_model_checkpoint": "bert-finetuned-sem_eval-english\\checkpoint-85976",
4
+ "epoch": 5.0,
5
+ "eval_steps": 500,
6
+ "global_step": 107470,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.023262305759746905,
13
+ "grad_norm": 0.5290595293045044,
14
+ "learning_rate": 1.9906950776961013e-05,
15
+ "loss": 0.2421,
16
+ "step": 500
17
+ },
18
+ {
19
+ "epoch": 0.04652461151949381,
20
+ "grad_norm": 0.7135451436042786,
21
+ "learning_rate": 1.9813901553922027e-05,
22
+ "loss": 0.153,
23
+ "step": 1000
24
+ },
25
+ {
26
+ "epoch": 0.06978691727924072,
27
+ "grad_norm": 0.6057224869728088,
28
+ "learning_rate": 1.9720852330883038e-05,
29
+ "loss": 0.135,
30
+ "step": 1500
31
+ },
32
+ {
33
+ "epoch": 0.09304922303898762,
34
+ "grad_norm": 2.033938407897949,
35
+ "learning_rate": 1.962780310784405e-05,
36
+ "loss": 0.1273,
37
+ "step": 2000
38
+ },
39
+ {
40
+ "epoch": 0.11631152879873453,
41
+ "grad_norm": 1.6713635921478271,
42
+ "learning_rate": 1.9534753884805063e-05,
43
+ "loss": 0.1221,
44
+ "step": 2500
45
+ },
46
+ {
47
+ "epoch": 0.13957383455848144,
48
+ "grad_norm": 0.4952755570411682,
49
+ "learning_rate": 1.9441704661766077e-05,
50
+ "loss": 0.116,
51
+ "step": 3000
52
+ },
53
+ {
54
+ "epoch": 0.16283614031822835,
55
+ "grad_norm": 0.5051670670509338,
56
+ "learning_rate": 1.9348655438727088e-05,
57
+ "loss": 0.1147,
58
+ "step": 3500
59
+ },
60
+ {
61
+ "epoch": 0.18609844607797524,
62
+ "grad_norm": 0.9887399077415466,
63
+ "learning_rate": 1.92556062156881e-05,
64
+ "loss": 0.1152,
65
+ "step": 4000
66
+ },
67
+ {
68
+ "epoch": 0.20936075183772215,
69
+ "grad_norm": 0.7931112051010132,
70
+ "learning_rate": 1.9162556992649114e-05,
71
+ "loss": 0.1138,
72
+ "step": 4500
73
+ },
74
+ {
75
+ "epoch": 0.23262305759746907,
76
+ "grad_norm": 0.7627053260803223,
77
+ "learning_rate": 1.9069507769610124e-05,
78
+ "loss": 0.113,
79
+ "step": 5000
80
+ },
81
+ {
82
+ "epoch": 0.25588536335721596,
83
+ "grad_norm": 0.6043164730072021,
84
+ "learning_rate": 1.897645854657114e-05,
85
+ "loss": 0.1119,
86
+ "step": 5500
87
+ },
88
+ {
89
+ "epoch": 0.27914766911696287,
90
+ "grad_norm": 0.5762751698493958,
91
+ "learning_rate": 1.888340932353215e-05,
92
+ "loss": 0.1107,
93
+ "step": 6000
94
+ },
95
+ {
96
+ "epoch": 0.3024099748767098,
97
+ "grad_norm": 0.6853972673416138,
98
+ "learning_rate": 1.8790360100493164e-05,
99
+ "loss": 0.1075,
100
+ "step": 6500
101
+ },
102
+ {
103
+ "epoch": 0.3256722806364567,
104
+ "grad_norm": 0.6911000609397888,
105
+ "learning_rate": 1.8697310877454175e-05,
106
+ "loss": 0.1079,
107
+ "step": 7000
108
+ },
109
+ {
110
+ "epoch": 0.3489345863962036,
111
+ "grad_norm": 0.5828253626823425,
112
+ "learning_rate": 1.860426165441519e-05,
113
+ "loss": 0.1091,
114
+ "step": 7500
115
+ },
116
+ {
117
+ "epoch": 0.3721968921559505,
118
+ "grad_norm": 0.782696008682251,
119
+ "learning_rate": 1.85112124313762e-05,
120
+ "loss": 0.1084,
121
+ "step": 8000
122
+ },
123
+ {
124
+ "epoch": 0.3954591979156974,
125
+ "grad_norm": 0.8830183148384094,
126
+ "learning_rate": 1.841816320833721e-05,
127
+ "loss": 0.1059,
128
+ "step": 8500
129
+ },
130
+ {
131
+ "epoch": 0.4187215036754443,
132
+ "grad_norm": 1.4506700038909912,
133
+ "learning_rate": 1.8325113985298225e-05,
134
+ "loss": 0.1058,
135
+ "step": 9000
136
+ },
137
+ {
138
+ "epoch": 0.4419838094351912,
139
+ "grad_norm": 1.0449475049972534,
140
+ "learning_rate": 1.8232064762259236e-05,
141
+ "loss": 0.1067,
142
+ "step": 9500
143
+ },
144
+ {
145
+ "epoch": 0.46524611519493814,
146
+ "grad_norm": 1.4629307985305786,
147
+ "learning_rate": 1.8139015539220247e-05,
148
+ "loss": 0.1031,
149
+ "step": 10000
150
+ },
151
+ {
152
+ "epoch": 0.48850842095468505,
153
+ "grad_norm": 0.5856618285179138,
154
+ "learning_rate": 1.804596631618126e-05,
155
+ "loss": 0.1064,
156
+ "step": 10500
157
+ },
158
+ {
159
+ "epoch": 0.5117707267144319,
160
+ "grad_norm": 0.7551602125167847,
161
+ "learning_rate": 1.7952917093142276e-05,
162
+ "loss": 0.1029,
163
+ "step": 11000
164
+ },
165
+ {
166
+ "epoch": 0.5350330324741789,
167
+ "grad_norm": 0.762008011341095,
168
+ "learning_rate": 1.7859867870103287e-05,
169
+ "loss": 0.1043,
170
+ "step": 11500
171
+ },
172
+ {
173
+ "epoch": 0.5582953382339257,
174
+ "grad_norm": 0.9580531716346741,
175
+ "learning_rate": 1.7766818647064298e-05,
176
+ "loss": 0.1048,
177
+ "step": 12000
178
+ },
179
+ {
180
+ "epoch": 0.5815576439936726,
181
+ "grad_norm": 1.0684598684310913,
182
+ "learning_rate": 1.7673769424025312e-05,
183
+ "loss": 0.1041,
184
+ "step": 12500
185
+ },
186
+ {
187
+ "epoch": 0.6048199497534196,
188
+ "grad_norm": 1.6208114624023438,
189
+ "learning_rate": 1.7580720200986323e-05,
190
+ "loss": 0.1005,
191
+ "step": 13000
192
+ },
193
+ {
194
+ "epoch": 0.6280822555131664,
195
+ "grad_norm": 1.0143775939941406,
196
+ "learning_rate": 1.7487670977947334e-05,
197
+ "loss": 0.1012,
198
+ "step": 13500
199
+ },
200
+ {
201
+ "epoch": 0.6513445612729134,
202
+ "grad_norm": 0.8733798861503601,
203
+ "learning_rate": 1.7394621754908348e-05,
204
+ "loss": 0.1009,
205
+ "step": 14000
206
+ },
207
+ {
208
+ "epoch": 0.6746068670326603,
209
+ "grad_norm": 1.1354494094848633,
210
+ "learning_rate": 1.730157253186936e-05,
211
+ "loss": 0.1012,
212
+ "step": 14500
213
+ },
214
+ {
215
+ "epoch": 0.6978691727924072,
216
+ "grad_norm": 0.852428674697876,
217
+ "learning_rate": 1.720852330883037e-05,
218
+ "loss": 0.1003,
219
+ "step": 15000
220
+ },
221
+ {
222
+ "epoch": 0.7211314785521541,
223
+ "grad_norm": 0.6298101544380188,
224
+ "learning_rate": 1.7115474085791384e-05,
225
+ "loss": 0.1009,
226
+ "step": 15500
227
+ },
228
+ {
229
+ "epoch": 0.744393784311901,
230
+ "grad_norm": 0.6968585252761841,
231
+ "learning_rate": 1.70224248627524e-05,
232
+ "loss": 0.1002,
233
+ "step": 16000
234
+ },
235
+ {
236
+ "epoch": 0.7676560900716479,
237
+ "grad_norm": 0.7687616944313049,
238
+ "learning_rate": 1.692937563971341e-05,
239
+ "loss": 0.0976,
240
+ "step": 16500
241
+ },
242
+ {
243
+ "epoch": 0.7909183958313948,
244
+ "grad_norm": 0.9958521723747253,
245
+ "learning_rate": 1.6836326416674424e-05,
246
+ "loss": 0.0996,
247
+ "step": 17000
248
+ },
249
+ {
250
+ "epoch": 0.8141807015911418,
251
+ "grad_norm": 1.9353946447372437,
252
+ "learning_rate": 1.6743277193635435e-05,
253
+ "loss": 0.0982,
254
+ "step": 17500
255
+ },
256
+ {
257
+ "epoch": 0.8374430073508886,
258
+ "grad_norm": 0.4970337450504303,
259
+ "learning_rate": 1.6650227970596446e-05,
260
+ "loss": 0.1013,
261
+ "step": 18000
262
+ },
263
+ {
264
+ "epoch": 0.8607053131106355,
265
+ "grad_norm": 1.3484145402908325,
266
+ "learning_rate": 1.655717874755746e-05,
267
+ "loss": 0.1001,
268
+ "step": 18500
269
+ },
270
+ {
271
+ "epoch": 0.8839676188703824,
272
+ "grad_norm": 0.9516633749008179,
273
+ "learning_rate": 1.646412952451847e-05,
274
+ "loss": 0.0978,
275
+ "step": 19000
276
+ },
277
+ {
278
+ "epoch": 0.9072299246301293,
279
+ "grad_norm": 1.358478307723999,
280
+ "learning_rate": 1.6371080301479485e-05,
281
+ "loss": 0.1016,
282
+ "step": 19500
283
+ },
284
+ {
285
+ "epoch": 0.9304922303898763,
286
+ "grad_norm": 0.5643423795700073,
287
+ "learning_rate": 1.6278031078440496e-05,
288
+ "loss": 0.0983,
289
+ "step": 20000
290
+ },
291
+ {
292
+ "epoch": 0.9537545361496231,
293
+ "grad_norm": 0.532564103603363,
294
+ "learning_rate": 1.618498185540151e-05,
295
+ "loss": 0.0972,
296
+ "step": 20500
297
+ },
298
+ {
299
+ "epoch": 0.9770168419093701,
300
+ "grad_norm": 1.3922828435897827,
301
+ "learning_rate": 1.609193263236252e-05,
302
+ "loss": 0.0976,
303
+ "step": 21000
304
+ },
305
+ {
306
+ "epoch": 1.0,
307
+ "eval_accuracy": 0.5307062436028659,
308
+ "eval_f1": 0.7361535927721048,
309
+ "eval_loss": 0.09595564752817154,
310
+ "eval_roc_auc": 0.8461935311283144,
311
+ "eval_runtime": 24.262,
312
+ "eval_samples_per_second": 885.911,
313
+ "eval_steps_per_second": 110.749,
314
+ "step": 21494
315
+ },
316
+ {
317
+ "epoch": 1.0002791476691169,
318
+ "grad_norm": 0.6412343978881836,
319
+ "learning_rate": 1.5998883409323532e-05,
320
+ "loss": 0.0986,
321
+ "step": 21500
322
+ },
323
+ {
324
+ "epoch": 1.0235414534288638,
325
+ "grad_norm": 0.6500938534736633,
326
+ "learning_rate": 1.5905834186284547e-05,
327
+ "loss": 0.0889,
328
+ "step": 22000
329
+ },
330
+ {
331
+ "epoch": 1.0468037591886108,
332
+ "grad_norm": 0.49381542205810547,
333
+ "learning_rate": 1.5812784963245558e-05,
334
+ "loss": 0.0861,
335
+ "step": 22500
336
+ },
337
+ {
338
+ "epoch": 1.0700660649483578,
339
+ "grad_norm": 0.790490448474884,
340
+ "learning_rate": 1.571973574020657e-05,
341
+ "loss": 0.0853,
342
+ "step": 23000
343
+ },
344
+ {
345
+ "epoch": 1.0933283707081045,
346
+ "grad_norm": 1.0638964176177979,
347
+ "learning_rate": 1.5626686517167583e-05,
348
+ "loss": 0.0855,
349
+ "step": 23500
350
+ },
351
+ {
352
+ "epoch": 1.1165906764678515,
353
+ "grad_norm": 0.8379644751548767,
354
+ "learning_rate": 1.5533637294128597e-05,
355
+ "loss": 0.0839,
356
+ "step": 24000
357
+ },
358
+ {
359
+ "epoch": 1.1398529822275985,
360
+ "grad_norm": 1.5022120475769043,
361
+ "learning_rate": 1.5440588071089608e-05,
362
+ "loss": 0.0883,
363
+ "step": 24500
364
+ },
365
+ {
366
+ "epoch": 1.1631152879873454,
367
+ "grad_norm": 0.6825814247131348,
368
+ "learning_rate": 1.534753884805062e-05,
369
+ "loss": 0.0852,
370
+ "step": 25000
371
+ },
372
+ {
373
+ "epoch": 1.1863775937470922,
374
+ "grad_norm": 2.154118776321411,
375
+ "learning_rate": 1.5254489625011633e-05,
376
+ "loss": 0.0854,
377
+ "step": 25500
378
+ },
379
+ {
380
+ "epoch": 1.2096398995068391,
381
+ "grad_norm": 0.8945685029029846,
382
+ "learning_rate": 1.5161440401972644e-05,
383
+ "loss": 0.0862,
384
+ "step": 26000
385
+ },
386
+ {
387
+ "epoch": 1.2329022052665861,
388
+ "grad_norm": 1.1257520914077759,
389
+ "learning_rate": 1.5068391178933657e-05,
390
+ "loss": 0.0832,
391
+ "step": 26500
392
+ },
393
+ {
394
+ "epoch": 1.2561645110263329,
395
+ "grad_norm": 0.785380482673645,
396
+ "learning_rate": 1.497534195589467e-05,
397
+ "loss": 0.0842,
398
+ "step": 27000
399
+ },
400
+ {
401
+ "epoch": 1.2794268167860798,
402
+ "grad_norm": 0.4283202290534973,
403
+ "learning_rate": 1.488229273285568e-05,
404
+ "loss": 0.0859,
405
+ "step": 27500
406
+ },
407
+ {
408
+ "epoch": 1.3026891225458268,
409
+ "grad_norm": 1.3082115650177002,
410
+ "learning_rate": 1.4789243509816695e-05,
411
+ "loss": 0.0824,
412
+ "step": 28000
413
+ },
414
+ {
415
+ "epoch": 1.3259514283055736,
416
+ "grad_norm": 0.6663931608200073,
417
+ "learning_rate": 1.4696194286777707e-05,
418
+ "loss": 0.0868,
419
+ "step": 28500
420
+ },
421
+ {
422
+ "epoch": 1.3492137340653205,
423
+ "grad_norm": 1.093483805656433,
424
+ "learning_rate": 1.460314506373872e-05,
425
+ "loss": 0.0871,
426
+ "step": 29000
427
+ },
428
+ {
429
+ "epoch": 1.3724760398250675,
430
+ "grad_norm": 1.7370342016220093,
431
+ "learning_rate": 1.4510095840699731e-05,
432
+ "loss": 0.0858,
433
+ "step": 29500
434
+ },
435
+ {
436
+ "epoch": 1.3957383455848142,
437
+ "grad_norm": 1.280945062637329,
438
+ "learning_rate": 1.4417046617660744e-05,
439
+ "loss": 0.0853,
440
+ "step": 30000
441
+ },
442
+ {
443
+ "epoch": 1.4190006513445612,
444
+ "grad_norm": 0.44828563928604126,
445
+ "learning_rate": 1.4323997394621756e-05,
446
+ "loss": 0.0828,
447
+ "step": 30500
448
+ },
449
+ {
450
+ "epoch": 1.4422629571043082,
451
+ "grad_norm": 1.0420727729797363,
452
+ "learning_rate": 1.4230948171582769e-05,
453
+ "loss": 0.0836,
454
+ "step": 31000
455
+ },
456
+ {
457
+ "epoch": 1.4655252628640552,
458
+ "grad_norm": 1.529417634010315,
459
+ "learning_rate": 1.413789894854378e-05,
460
+ "loss": 0.0827,
461
+ "step": 31500
462
+ },
463
+ {
464
+ "epoch": 1.4887875686238021,
465
+ "grad_norm": 0.8359895944595337,
466
+ "learning_rate": 1.4044849725504792e-05,
467
+ "loss": 0.0836,
468
+ "step": 32000
469
+ },
470
+ {
471
+ "epoch": 1.5120498743835489,
472
+ "grad_norm": 0.8840139508247375,
473
+ "learning_rate": 1.3951800502465807e-05,
474
+ "loss": 0.0858,
475
+ "step": 32500
476
+ },
477
+ {
478
+ "epoch": 1.5353121801432958,
479
+ "grad_norm": 1.2289206981658936,
480
+ "learning_rate": 1.385875127942682e-05,
481
+ "loss": 0.0835,
482
+ "step": 33000
483
+ },
484
+ {
485
+ "epoch": 1.5585744859030428,
486
+ "grad_norm": 1.725092887878418,
487
+ "learning_rate": 1.376570205638783e-05,
488
+ "loss": 0.0833,
489
+ "step": 33500
490
+ },
491
+ {
492
+ "epoch": 1.5818367916627896,
493
+ "grad_norm": 1.8183008432388306,
494
+ "learning_rate": 1.3672652833348843e-05,
495
+ "loss": 0.085,
496
+ "step": 34000
497
+ },
498
+ {
499
+ "epoch": 1.6050990974225365,
500
+ "grad_norm": 1.5482715368270874,
501
+ "learning_rate": 1.3579603610309855e-05,
502
+ "loss": 0.0825,
503
+ "step": 34500
504
+ },
505
+ {
506
+ "epoch": 1.6283614031822835,
507
+ "grad_norm": 2.1967756748199463,
508
+ "learning_rate": 1.3486554387270866e-05,
509
+ "loss": 0.0857,
510
+ "step": 35000
511
+ },
512
+ {
513
+ "epoch": 1.6516237089420303,
514
+ "grad_norm": 0.9423213005065918,
515
+ "learning_rate": 1.3393505164231879e-05,
516
+ "loss": 0.0818,
517
+ "step": 35500
518
+ },
519
+ {
520
+ "epoch": 1.6748860147017772,
521
+ "grad_norm": 0.8981990218162537,
522
+ "learning_rate": 1.3300455941192892e-05,
523
+ "loss": 0.0821,
524
+ "step": 36000
525
+ },
526
+ {
527
+ "epoch": 1.6981483204615242,
528
+ "grad_norm": 1.125404715538025,
529
+ "learning_rate": 1.3207406718153904e-05,
530
+ "loss": 0.0849,
531
+ "step": 36500
532
+ },
533
+ {
534
+ "epoch": 1.721410626221271,
535
+ "grad_norm": 2.038687229156494,
536
+ "learning_rate": 1.3114357495114918e-05,
537
+ "loss": 0.0847,
538
+ "step": 37000
539
+ },
540
+ {
541
+ "epoch": 1.744672931981018,
542
+ "grad_norm": 1.0153677463531494,
543
+ "learning_rate": 1.302130827207593e-05,
544
+ "loss": 0.084,
545
+ "step": 37500
546
+ },
547
+ {
548
+ "epoch": 1.7679352377407649,
549
+ "grad_norm": 1.0724354982376099,
550
+ "learning_rate": 1.2928259049036942e-05,
551
+ "loss": 0.0825,
552
+ "step": 38000
553
+ },
554
+ {
555
+ "epoch": 1.7911975435005116,
556
+ "grad_norm": 0.6580795645713806,
557
+ "learning_rate": 1.2835209825997955e-05,
558
+ "loss": 0.0843,
559
+ "step": 38500
560
+ },
561
+ {
562
+ "epoch": 1.8144598492602588,
563
+ "grad_norm": 1.5284615755081177,
564
+ "learning_rate": 1.2742160602958966e-05,
565
+ "loss": 0.0858,
566
+ "step": 39000
567
+ },
568
+ {
569
+ "epoch": 1.8377221550200056,
570
+ "grad_norm": 2.040937900543213,
571
+ "learning_rate": 1.2649111379919978e-05,
572
+ "loss": 0.0838,
573
+ "step": 39500
574
+ },
575
+ {
576
+ "epoch": 1.8609844607797523,
577
+ "grad_norm": 1.1230378150939941,
578
+ "learning_rate": 1.255606215688099e-05,
579
+ "loss": 0.0822,
580
+ "step": 40000
581
+ },
582
+ {
583
+ "epoch": 1.8842467665394995,
584
+ "grad_norm": 0.6881332397460938,
585
+ "learning_rate": 1.2463012933842002e-05,
586
+ "loss": 0.0827,
587
+ "step": 40500
588
+ },
589
+ {
590
+ "epoch": 1.9075090722992463,
591
+ "grad_norm": 1.4516489505767822,
592
+ "learning_rate": 1.2369963710803016e-05,
593
+ "loss": 0.0797,
594
+ "step": 41000
595
+ },
596
+ {
597
+ "epoch": 1.9307713780589932,
598
+ "grad_norm": 0.8135964870452881,
599
+ "learning_rate": 1.2276914487764029e-05,
600
+ "loss": 0.0823,
601
+ "step": 41500
602
+ },
603
+ {
604
+ "epoch": 1.9540336838187402,
605
+ "grad_norm": 0.51099693775177,
606
+ "learning_rate": 1.2183865264725041e-05,
607
+ "loss": 0.0819,
608
+ "step": 42000
609
+ },
610
+ {
611
+ "epoch": 1.977295989578487,
612
+ "grad_norm": 1.2015933990478516,
613
+ "learning_rate": 1.2090816041686054e-05,
614
+ "loss": 0.0826,
615
+ "step": 42500
616
+ },
617
+ {
618
+ "epoch": 2.0,
619
+ "eval_accuracy": 0.5615520610402903,
620
+ "eval_f1": 0.7499260228316903,
621
+ "eval_loss": 0.09278739243745804,
622
+ "eval_roc_auc": 0.8556249389001388,
623
+ "eval_runtime": 23.2564,
624
+ "eval_samples_per_second": 924.221,
625
+ "eval_steps_per_second": 115.538,
626
+ "step": 42988
627
+ },
628
+ {
629
+ "epoch": 2.0005582953382337,
630
+ "grad_norm": 1.1591126918792725,
631
+ "learning_rate": 1.1997766818647065e-05,
632
+ "loss": 0.0807,
633
+ "step": 43000
634
+ },
635
+ {
636
+ "epoch": 2.023820601097981,
637
+ "grad_norm": 0.6749496459960938,
638
+ "learning_rate": 1.1904717595608077e-05,
639
+ "loss": 0.07,
640
+ "step": 43500
641
+ },
642
+ {
643
+ "epoch": 2.0470829068577276,
644
+ "grad_norm": 1.0793603658676147,
645
+ "learning_rate": 1.181166837256909e-05,
646
+ "loss": 0.0682,
647
+ "step": 44000
648
+ },
649
+ {
650
+ "epoch": 2.070345212617475,
651
+ "grad_norm": 1.9193094968795776,
652
+ "learning_rate": 1.1718619149530101e-05,
653
+ "loss": 0.0659,
654
+ "step": 44500
655
+ },
656
+ {
657
+ "epoch": 2.0936075183772216,
658
+ "grad_norm": 2.0090835094451904,
659
+ "learning_rate": 1.1625569926491114e-05,
660
+ "loss": 0.0661,
661
+ "step": 45000
662
+ },
663
+ {
664
+ "epoch": 2.1168698241369683,
665
+ "grad_norm": 1.252321720123291,
666
+ "learning_rate": 1.1532520703452128e-05,
667
+ "loss": 0.0663,
668
+ "step": 45500
669
+ },
670
+ {
671
+ "epoch": 2.1401321298967155,
672
+ "grad_norm": 0.25603464245796204,
673
+ "learning_rate": 1.143947148041314e-05,
674
+ "loss": 0.0671,
675
+ "step": 46000
676
+ },
677
+ {
678
+ "epoch": 2.1633944356564623,
679
+ "grad_norm": 0.9230429530143738,
680
+ "learning_rate": 1.1346422257374151e-05,
681
+ "loss": 0.0654,
682
+ "step": 46500
683
+ },
684
+ {
685
+ "epoch": 2.186656741416209,
686
+ "grad_norm": 0.8180581331253052,
687
+ "learning_rate": 1.1253373034335164e-05,
688
+ "loss": 0.0675,
689
+ "step": 47000
690
+ },
691
+ {
692
+ "epoch": 2.209919047175956,
693
+ "grad_norm": 1.928276777267456,
694
+ "learning_rate": 1.1160323811296177e-05,
695
+ "loss": 0.0686,
696
+ "step": 47500
697
+ },
698
+ {
699
+ "epoch": 2.233181352935703,
700
+ "grad_norm": 1.222936987876892,
701
+ "learning_rate": 1.106727458825719e-05,
702
+ "loss": 0.0706,
703
+ "step": 48000
704
+ },
705
+ {
706
+ "epoch": 2.2564436586954497,
707
+ "grad_norm": 0.6796595454216003,
708
+ "learning_rate": 1.09742253652182e-05,
709
+ "loss": 0.0681,
710
+ "step": 48500
711
+ },
712
+ {
713
+ "epoch": 2.279705964455197,
714
+ "grad_norm": 1.2472426891326904,
715
+ "learning_rate": 1.0881176142179213e-05,
716
+ "loss": 0.0677,
717
+ "step": 49000
718
+ },
719
+ {
720
+ "epoch": 2.3029682702149437,
721
+ "grad_norm": 1.2157268524169922,
722
+ "learning_rate": 1.0788126919140227e-05,
723
+ "loss": 0.0688,
724
+ "step": 49500
725
+ },
726
+ {
727
+ "epoch": 2.326230575974691,
728
+ "grad_norm": 1.0946940183639526,
729
+ "learning_rate": 1.069507769610124e-05,
730
+ "loss": 0.0669,
731
+ "step": 50000
732
+ },
733
+ {
734
+ "epoch": 2.3494928817344376,
735
+ "grad_norm": 2.56750750541687,
736
+ "learning_rate": 1.060202847306225e-05,
737
+ "loss": 0.065,
738
+ "step": 50500
739
+ },
740
+ {
741
+ "epoch": 2.3727551874941843,
742
+ "grad_norm": 0.8625161051750183,
743
+ "learning_rate": 1.0508979250023263e-05,
744
+ "loss": 0.0676,
745
+ "step": 51000
746
+ },
747
+ {
748
+ "epoch": 2.3960174932539315,
749
+ "grad_norm": 1.6813982725143433,
750
+ "learning_rate": 1.0415930026984276e-05,
751
+ "loss": 0.0669,
752
+ "step": 51500
753
+ },
754
+ {
755
+ "epoch": 2.4192797990136783,
756
+ "grad_norm": 1.76870596408844,
757
+ "learning_rate": 1.0322880803945287e-05,
758
+ "loss": 0.0704,
759
+ "step": 52000
760
+ },
761
+ {
762
+ "epoch": 2.442542104773425,
763
+ "grad_norm": 0.8901593089103699,
764
+ "learning_rate": 1.02298315809063e-05,
765
+ "loss": 0.0703,
766
+ "step": 52500
767
+ },
768
+ {
769
+ "epoch": 2.4658044105331722,
770
+ "grad_norm": 0.7651998400688171,
771
+ "learning_rate": 1.0136782357867312e-05,
772
+ "loss": 0.0685,
773
+ "step": 53000
774
+ },
775
+ {
776
+ "epoch": 2.489066716292919,
777
+ "grad_norm": 1.2652794122695923,
778
+ "learning_rate": 1.0043733134828325e-05,
779
+ "loss": 0.0653,
780
+ "step": 53500
781
+ },
782
+ {
783
+ "epoch": 2.5123290220526657,
784
+ "grad_norm": 1.768955111503601,
785
+ "learning_rate": 9.950683911789337e-06,
786
+ "loss": 0.0686,
787
+ "step": 54000
788
+ },
789
+ {
790
+ "epoch": 2.535591327812413,
791
+ "grad_norm": 1.6044102907180786,
792
+ "learning_rate": 9.85763468875035e-06,
793
+ "loss": 0.0676,
794
+ "step": 54500
795
+ },
796
+ {
797
+ "epoch": 2.5588536335721597,
798
+ "grad_norm": 1.808396816253662,
799
+ "learning_rate": 9.764585465711363e-06,
800
+ "loss": 0.0662,
801
+ "step": 55000
802
+ },
803
+ {
804
+ "epoch": 2.5821159393319064,
805
+ "grad_norm": 1.0778286457061768,
806
+ "learning_rate": 9.671536242672375e-06,
807
+ "loss": 0.0689,
808
+ "step": 55500
809
+ },
810
+ {
811
+ "epoch": 2.6053782450916536,
812
+ "grad_norm": 2.739319324493408,
813
+ "learning_rate": 9.578487019633386e-06,
814
+ "loss": 0.068,
815
+ "step": 56000
816
+ },
817
+ {
818
+ "epoch": 2.6286405508514004,
819
+ "grad_norm": 1.368030071258545,
820
+ "learning_rate": 9.485437796594399e-06,
821
+ "loss": 0.0655,
822
+ "step": 56500
823
+ },
824
+ {
825
+ "epoch": 2.651902856611147,
826
+ "grad_norm": 0.30945539474487305,
827
+ "learning_rate": 9.392388573555411e-06,
828
+ "loss": 0.0649,
829
+ "step": 57000
830
+ },
831
+ {
832
+ "epoch": 2.6751651623708943,
833
+ "grad_norm": 0.8296416997909546,
834
+ "learning_rate": 9.299339350516424e-06,
835
+ "loss": 0.0677,
836
+ "step": 57500
837
+ },
838
+ {
839
+ "epoch": 2.698427468130641,
840
+ "grad_norm": 1.1620192527770996,
841
+ "learning_rate": 9.206290127477437e-06,
842
+ "loss": 0.069,
843
+ "step": 58000
844
+ },
845
+ {
846
+ "epoch": 2.721689773890388,
847
+ "grad_norm": 0.9376591444015503,
848
+ "learning_rate": 9.11324090443845e-06,
849
+ "loss": 0.0662,
850
+ "step": 58500
851
+ },
852
+ {
853
+ "epoch": 2.744952079650135,
854
+ "grad_norm": 1.6231029033660889,
855
+ "learning_rate": 9.02019168139946e-06,
856
+ "loss": 0.0674,
857
+ "step": 59000
858
+ },
859
+ {
860
+ "epoch": 2.7682143854098817,
861
+ "grad_norm": 1.0340408086776733,
862
+ "learning_rate": 8.927142458360474e-06,
863
+ "loss": 0.0685,
864
+ "step": 59500
865
+ },
866
+ {
867
+ "epoch": 2.7914766911696285,
868
+ "grad_norm": 1.5797666311264038,
869
+ "learning_rate": 8.834093235321485e-06,
870
+ "loss": 0.065,
871
+ "step": 60000
872
+ },
873
+ {
874
+ "epoch": 2.8147389969293757,
875
+ "grad_norm": 0.9955604076385498,
876
+ "learning_rate": 8.741044012282498e-06,
877
+ "loss": 0.0682,
878
+ "step": 60500
879
+ },
880
+ {
881
+ "epoch": 2.8380013026891224,
882
+ "grad_norm": 2.2507500648498535,
883
+ "learning_rate": 8.64799478924351e-06,
884
+ "loss": 0.0651,
885
+ "step": 61000
886
+ },
887
+ {
888
+ "epoch": 2.861263608448869,
889
+ "grad_norm": 0.9272844791412354,
890
+ "learning_rate": 8.554945566204523e-06,
891
+ "loss": 0.0651,
892
+ "step": 61500
893
+ },
894
+ {
895
+ "epoch": 2.8845259142086164,
896
+ "grad_norm": 1.3886868953704834,
897
+ "learning_rate": 8.461896343165536e-06,
898
+ "loss": 0.066,
899
+ "step": 62000
900
+ },
901
+ {
902
+ "epoch": 2.907788219968363,
903
+ "grad_norm": 0.9660001397132874,
904
+ "learning_rate": 8.368847120126547e-06,
905
+ "loss": 0.0683,
906
+ "step": 62500
907
+ },
908
+ {
909
+ "epoch": 2.9310505257281103,
910
+ "grad_norm": 0.8844442963600159,
911
+ "learning_rate": 8.27579789708756e-06,
912
+ "loss": 0.0671,
913
+ "step": 63000
914
+ },
915
+ {
916
+ "epoch": 2.954312831487857,
917
+ "grad_norm": 2.407435417175293,
918
+ "learning_rate": 8.182748674048572e-06,
919
+ "loss": 0.0664,
920
+ "step": 63500
921
+ },
922
+ {
923
+ "epoch": 2.9775751372476043,
924
+ "grad_norm": 2.187854528427124,
925
+ "learning_rate": 8.089699451009585e-06,
926
+ "loss": 0.0666,
927
+ "step": 64000
928
+ },
929
+ {
930
+ "epoch": 3.0,
931
+ "eval_accuracy": 0.5649948822927329,
932
+ "eval_f1": 0.7591117292255597,
933
+ "eval_loss": 0.09460150450468063,
934
+ "eval_roc_auc": 0.8701062840131171,
935
+ "eval_runtime": 23.3037,
936
+ "eval_samples_per_second": 922.345,
937
+ "eval_steps_per_second": 115.304,
938
+ "step": 64482
939
+ },
940
+ {
941
+ "epoch": 3.000837443007351,
942
+ "grad_norm": 1.4572051763534546,
943
+ "learning_rate": 7.996650227970597e-06,
944
+ "loss": 0.0662,
945
+ "step": 64500
946
+ },
947
+ {
948
+ "epoch": 3.0240997487670978,
949
+ "grad_norm": 1.0595591068267822,
950
+ "learning_rate": 7.90360100493161e-06,
951
+ "loss": 0.0532,
952
+ "step": 65000
953
+ },
954
+ {
955
+ "epoch": 3.0473620545268445,
956
+ "grad_norm": 0.7926930785179138,
957
+ "learning_rate": 7.81055178189262e-06,
958
+ "loss": 0.0513,
959
+ "step": 65500
960
+ },
961
+ {
962
+ "epoch": 3.0706243602865917,
963
+ "grad_norm": 2.0223031044006348,
964
+ "learning_rate": 7.717502558853635e-06,
965
+ "loss": 0.056,
966
+ "step": 66000
967
+ },
968
+ {
969
+ "epoch": 3.0938866660463384,
970
+ "grad_norm": 1.3608500957489014,
971
+ "learning_rate": 7.624453335814647e-06,
972
+ "loss": 0.051,
973
+ "step": 66500
974
+ },
975
+ {
976
+ "epoch": 3.1171489718060856,
977
+ "grad_norm": 1.0539377927780151,
978
+ "learning_rate": 7.531404112775659e-06,
979
+ "loss": 0.0535,
980
+ "step": 67000
981
+ },
982
+ {
983
+ "epoch": 3.1404112775658324,
984
+ "grad_norm": 0.8993151187896729,
985
+ "learning_rate": 7.4383548897366704e-06,
986
+ "loss": 0.0529,
987
+ "step": 67500
988
+ },
989
+ {
990
+ "epoch": 3.163673583325579,
991
+ "grad_norm": 1.5115621089935303,
992
+ "learning_rate": 7.345305666697684e-06,
993
+ "loss": 0.0512,
994
+ "step": 68000
995
+ },
996
+ {
997
+ "epoch": 3.1869358890853263,
998
+ "grad_norm": 0.8718969225883484,
999
+ "learning_rate": 7.2522564436586965e-06,
1000
+ "loss": 0.0517,
1001
+ "step": 68500
1002
+ },
1003
+ {
1004
+ "epoch": 3.210198194845073,
1005
+ "grad_norm": 1.0791226625442505,
1006
+ "learning_rate": 7.159207220619708e-06,
1007
+ "loss": 0.0512,
1008
+ "step": 69000
1009
+ },
1010
+ {
1011
+ "epoch": 3.23346050060482,
1012
+ "grad_norm": 1.2362322807312012,
1013
+ "learning_rate": 7.06615799758072e-06,
1014
+ "loss": 0.0505,
1015
+ "step": 69500
1016
+ },
1017
+ {
1018
+ "epoch": 3.256722806364567,
1019
+ "grad_norm": 0.4878983497619629,
1020
+ "learning_rate": 6.9731087745417335e-06,
1021
+ "loss": 0.0523,
1022
+ "step": 70000
1023
+ },
1024
+ {
1025
+ "epoch": 3.2799851121243138,
1026
+ "grad_norm": 0.5156907439231873,
1027
+ "learning_rate": 6.880059551502746e-06,
1028
+ "loss": 0.0522,
1029
+ "step": 70500
1030
+ },
1031
+ {
1032
+ "epoch": 3.3032474178840605,
1033
+ "grad_norm": 1.0036829710006714,
1034
+ "learning_rate": 6.787010328463758e-06,
1035
+ "loss": 0.053,
1036
+ "step": 71000
1037
+ },
1038
+ {
1039
+ "epoch": 3.3265097236438077,
1040
+ "grad_norm": 1.9383690357208252,
1041
+ "learning_rate": 6.69396110542477e-06,
1042
+ "loss": 0.0524,
1043
+ "step": 71500
1044
+ },
1045
+ {
1046
+ "epoch": 3.3497720294035545,
1047
+ "grad_norm": 0.9468953609466553,
1048
+ "learning_rate": 6.600911882385782e-06,
1049
+ "loss": 0.0507,
1050
+ "step": 72000
1051
+ },
1052
+ {
1053
+ "epoch": 3.373034335163301,
1054
+ "grad_norm": 1.7953852415084839,
1055
+ "learning_rate": 6.507862659346795e-06,
1056
+ "loss": 0.0536,
1057
+ "step": 72500
1058
+ },
1059
+ {
1060
+ "epoch": 3.3962966409230484,
1061
+ "grad_norm": 3.1470677852630615,
1062
+ "learning_rate": 6.4148134363078075e-06,
1063
+ "loss": 0.0521,
1064
+ "step": 73000
1065
+ },
1066
+ {
1067
+ "epoch": 3.419558946682795,
1068
+ "grad_norm": 2.5121142864227295,
1069
+ "learning_rate": 6.321764213268819e-06,
1070
+ "loss": 0.0513,
1071
+ "step": 73500
1072
+ },
1073
+ {
1074
+ "epoch": 3.442821252442542,
1075
+ "grad_norm": 1.0771255493164062,
1076
+ "learning_rate": 6.228714990229832e-06,
1077
+ "loss": 0.0529,
1078
+ "step": 74000
1079
+ },
1080
+ {
1081
+ "epoch": 3.466083558202289,
1082
+ "grad_norm": 1.3458467721939087,
1083
+ "learning_rate": 6.1356657671908446e-06,
1084
+ "loss": 0.053,
1085
+ "step": 74500
1086
+ },
1087
+ {
1088
+ "epoch": 3.489345863962036,
1089
+ "grad_norm": 0.29975369572639465,
1090
+ "learning_rate": 6.042616544151857e-06,
1091
+ "loss": 0.0512,
1092
+ "step": 75000
1093
+ },
1094
+ {
1095
+ "epoch": 3.5126081697217826,
1096
+ "grad_norm": 1.2391622066497803,
1097
+ "learning_rate": 5.949567321112869e-06,
1098
+ "loss": 0.0519,
1099
+ "step": 75500
1100
+ },
1101
+ {
1102
+ "epoch": 3.5358704754815298,
1103
+ "grad_norm": 1.6513882875442505,
1104
+ "learning_rate": 5.8565180980738816e-06,
1105
+ "loss": 0.0529,
1106
+ "step": 76000
1107
+ },
1108
+ {
1109
+ "epoch": 3.5591327812412765,
1110
+ "grad_norm": 1.1643694639205933,
1111
+ "learning_rate": 5.763468875034894e-06,
1112
+ "loss": 0.0537,
1113
+ "step": 76500
1114
+ },
1115
+ {
1116
+ "epoch": 3.5823950870010233,
1117
+ "grad_norm": 1.9166603088378906,
1118
+ "learning_rate": 5.670419651995907e-06,
1119
+ "loss": 0.0508,
1120
+ "step": 77000
1121
+ },
1122
+ {
1123
+ "epoch": 3.6056573927607705,
1124
+ "grad_norm": 1.5334446430206299,
1125
+ "learning_rate": 5.5773704289569186e-06,
1126
+ "loss": 0.0514,
1127
+ "step": 77500
1128
+ },
1129
+ {
1130
+ "epoch": 3.628919698520517,
1131
+ "grad_norm": 2.500365972518921,
1132
+ "learning_rate": 5.48432120591793e-06,
1133
+ "loss": 0.0537,
1134
+ "step": 78000
1135
+ },
1136
+ {
1137
+ "epoch": 3.6521820042802644,
1138
+ "grad_norm": 0.8305968046188354,
1139
+ "learning_rate": 5.391271982878944e-06,
1140
+ "loss": 0.0546,
1141
+ "step": 78500
1142
+ },
1143
+ {
1144
+ "epoch": 3.675444310040011,
1145
+ "grad_norm": 1.3438687324523926,
1146
+ "learning_rate": 5.2982227598399564e-06,
1147
+ "loss": 0.0536,
1148
+ "step": 79000
1149
+ },
1150
+ {
1151
+ "epoch": 3.6987066157997583,
1152
+ "grad_norm": 1.1025956869125366,
1153
+ "learning_rate": 5.205173536800968e-06,
1154
+ "loss": 0.0546,
1155
+ "step": 79500
1156
+ },
1157
+ {
1158
+ "epoch": 3.721968921559505,
1159
+ "grad_norm": 1.6700533628463745,
1160
+ "learning_rate": 5.11212431376198e-06,
1161
+ "loss": 0.0533,
1162
+ "step": 80000
1163
+ },
1164
+ {
1165
+ "epoch": 3.745231227319252,
1166
+ "grad_norm": 0.8916147351264954,
1167
+ "learning_rate": 5.019075090722993e-06,
1168
+ "loss": 0.0516,
1169
+ "step": 80500
1170
+ },
1171
+ {
1172
+ "epoch": 3.768493533078999,
1173
+ "grad_norm": 1.7522839307785034,
1174
+ "learning_rate": 4.926025867684005e-06,
1175
+ "loss": 0.0537,
1176
+ "step": 81000
1177
+ },
1178
+ {
1179
+ "epoch": 3.791755838838746,
1180
+ "grad_norm": 1.4133764505386353,
1181
+ "learning_rate": 4.832976644645018e-06,
1182
+ "loss": 0.0542,
1183
+ "step": 81500
1184
+ },
1185
+ {
1186
+ "epoch": 3.8150181445984925,
1187
+ "grad_norm": 0.9128021001815796,
1188
+ "learning_rate": 4.73992742160603e-06,
1189
+ "loss": 0.0516,
1190
+ "step": 82000
1191
+ },
1192
+ {
1193
+ "epoch": 3.8382804503582397,
1194
+ "grad_norm": 2.4152848720550537,
1195
+ "learning_rate": 4.646878198567042e-06,
1196
+ "loss": 0.0532,
1197
+ "step": 82500
1198
+ },
1199
+ {
1200
+ "epoch": 3.8615427561179865,
1201
+ "grad_norm": 1.5950450897216797,
1202
+ "learning_rate": 4.553828975528055e-06,
1203
+ "loss": 0.0523,
1204
+ "step": 83000
1205
+ },
1206
+ {
1207
+ "epoch": 3.8848050618777332,
1208
+ "grad_norm": 1.5487794876098633,
1209
+ "learning_rate": 4.4607797524890675e-06,
1210
+ "loss": 0.054,
1211
+ "step": 83500
1212
+ },
1213
+ {
1214
+ "epoch": 3.9080673676374804,
1215
+ "grad_norm": 1.6051621437072754,
1216
+ "learning_rate": 4.367730529450079e-06,
1217
+ "loss": 0.052,
1218
+ "step": 84000
1219
+ },
1220
+ {
1221
+ "epoch": 3.931329673397227,
1222
+ "grad_norm": 1.2082515954971313,
1223
+ "learning_rate": 4.274681306411092e-06,
1224
+ "loss": 0.0512,
1225
+ "step": 84500
1226
+ },
1227
+ {
1228
+ "epoch": 3.954591979156974,
1229
+ "grad_norm": 1.1482079029083252,
1230
+ "learning_rate": 4.1816320833721045e-06,
1231
+ "loss": 0.0514,
1232
+ "step": 85000
1233
+ },
1234
+ {
1235
+ "epoch": 3.977854284916721,
1236
+ "grad_norm": 1.919583797454834,
1237
+ "learning_rate": 4.088582860333117e-06,
1238
+ "loss": 0.0543,
1239
+ "step": 85500
1240
+ },
1241
+ {
1242
+ "epoch": 4.0,
1243
+ "eval_accuracy": 0.5645761607890574,
1244
+ "eval_f1": 0.759579488098438,
1245
+ "eval_loss": 0.10311879962682724,
1246
+ "eval_roc_auc": 0.8744104988949349,
1247
+ "eval_runtime": 22.8736,
1248
+ "eval_samples_per_second": 939.686,
1249
+ "eval_steps_per_second": 117.472,
1250
+ "step": 85976
1251
+ },
1252
+ {
1253
+ "epoch": 4.001116590676467,
1254
+ "grad_norm": 0.9219486713409424,
1255
+ "learning_rate": 3.995533637294129e-06,
1256
+ "loss": 0.0531,
1257
+ "step": 86000
1258
+ },
1259
+ {
1260
+ "epoch": 4.024378896436215,
1261
+ "grad_norm": 0.5063530206680298,
1262
+ "learning_rate": 3.902484414255141e-06,
1263
+ "loss": 0.0431,
1264
+ "step": 86500
1265
+ },
1266
+ {
1267
+ "epoch": 4.047641202195962,
1268
+ "grad_norm": 0.9435988068580627,
1269
+ "learning_rate": 3.8094351912161537e-06,
1270
+ "loss": 0.0419,
1271
+ "step": 87000
1272
+ },
1273
+ {
1274
+ "epoch": 4.070903507955709,
1275
+ "grad_norm": 0.8546033501625061,
1276
+ "learning_rate": 3.716385968177166e-06,
1277
+ "loss": 0.0413,
1278
+ "step": 87500
1279
+ },
1280
+ {
1281
+ "epoch": 4.094165813715455,
1282
+ "grad_norm": 1.6249778270721436,
1283
+ "learning_rate": 3.6233367451381785e-06,
1284
+ "loss": 0.0429,
1285
+ "step": 88000
1286
+ },
1287
+ {
1288
+ "epoch": 4.1174281194752025,
1289
+ "grad_norm": 1.7502926588058472,
1290
+ "learning_rate": 3.5302875220991907e-06,
1291
+ "loss": 0.0395,
1292
+ "step": 88500
1293
+ },
1294
+ {
1295
+ "epoch": 4.14069042523495,
1296
+ "grad_norm": 3.170189380645752,
1297
+ "learning_rate": 3.4372382990602033e-06,
1298
+ "loss": 0.0431,
1299
+ "step": 89000
1300
+ },
1301
+ {
1302
+ "epoch": 4.163952730994696,
1303
+ "grad_norm": 1.2306873798370361,
1304
+ "learning_rate": 3.3441890760212155e-06,
1305
+ "loss": 0.0419,
1306
+ "step": 89500
1307
+ },
1308
+ {
1309
+ "epoch": 4.187215036754443,
1310
+ "grad_norm": 2.3752849102020264,
1311
+ "learning_rate": 3.251139852982228e-06,
1312
+ "loss": 0.0426,
1313
+ "step": 90000
1314
+ },
1315
+ {
1316
+ "epoch": 4.21047734251419,
1317
+ "grad_norm": 1.0530017614364624,
1318
+ "learning_rate": 3.1580906299432403e-06,
1319
+ "loss": 0.0406,
1320
+ "step": 90500
1321
+ },
1322
+ {
1323
+ "epoch": 4.233739648273937,
1324
+ "grad_norm": 1.7250635623931885,
1325
+ "learning_rate": 3.065041406904253e-06,
1326
+ "loss": 0.0431,
1327
+ "step": 91000
1328
+ },
1329
+ {
1330
+ "epoch": 4.257001954033684,
1331
+ "grad_norm": 0.6301759481430054,
1332
+ "learning_rate": 2.9719921838652647e-06,
1333
+ "loss": 0.0437,
1334
+ "step": 91500
1335
+ },
1336
+ {
1337
+ "epoch": 4.280264259793431,
1338
+ "grad_norm": 2.9674508571624756,
1339
+ "learning_rate": 2.8789429608262777e-06,
1340
+ "loss": 0.0429,
1341
+ "step": 92000
1342
+ },
1343
+ {
1344
+ "epoch": 4.303526565553177,
1345
+ "grad_norm": 1.3684778213500977,
1346
+ "learning_rate": 2.7858937377872895e-06,
1347
+ "loss": 0.0413,
1348
+ "step": 92500
1349
+ },
1350
+ {
1351
+ "epoch": 4.326788871312925,
1352
+ "grad_norm": 2.5620508193969727,
1353
+ "learning_rate": 2.692844514748302e-06,
1354
+ "loss": 0.0425,
1355
+ "step": 93000
1356
+ },
1357
+ {
1358
+ "epoch": 4.350051177072672,
1359
+ "grad_norm": 2.529858350753784,
1360
+ "learning_rate": 2.5997952917093143e-06,
1361
+ "loss": 0.0432,
1362
+ "step": 93500
1363
+ },
1364
+ {
1365
+ "epoch": 4.373313482832418,
1366
+ "grad_norm": 1.6359411478042603,
1367
+ "learning_rate": 2.5067460686703265e-06,
1368
+ "loss": 0.0437,
1369
+ "step": 94000
1370
+ },
1371
+ {
1372
+ "epoch": 4.396575788592165,
1373
+ "grad_norm": 1.6633356809616089,
1374
+ "learning_rate": 2.413696845631339e-06,
1375
+ "loss": 0.0415,
1376
+ "step": 94500
1377
+ },
1378
+ {
1379
+ "epoch": 4.419838094351912,
1380
+ "grad_norm": 0.9840025901794434,
1381
+ "learning_rate": 2.3206476225923518e-06,
1382
+ "loss": 0.0387,
1383
+ "step": 95000
1384
+ },
1385
+ {
1386
+ "epoch": 4.443100400111659,
1387
+ "grad_norm": 1.1913479566574097,
1388
+ "learning_rate": 2.227598399553364e-06,
1389
+ "loss": 0.0433,
1390
+ "step": 95500
1391
+ },
1392
+ {
1393
+ "epoch": 4.466362705871406,
1394
+ "grad_norm": 0.9769937992095947,
1395
+ "learning_rate": 2.1345491765143766e-06,
1396
+ "loss": 0.042,
1397
+ "step": 96000
1398
+ },
1399
+ {
1400
+ "epoch": 4.489625011631153,
1401
+ "grad_norm": 0.4699022173881531,
1402
+ "learning_rate": 2.0414999534753888e-06,
1403
+ "loss": 0.0416,
1404
+ "step": 96500
1405
+ },
1406
+ {
1407
+ "epoch": 4.512887317390899,
1408
+ "grad_norm": 1.7338500022888184,
1409
+ "learning_rate": 1.948450730436401e-06,
1410
+ "loss": 0.0422,
1411
+ "step": 97000
1412
+ },
1413
+ {
1414
+ "epoch": 4.536149623150647,
1415
+ "grad_norm": 2.9296669960021973,
1416
+ "learning_rate": 1.8554015073974132e-06,
1417
+ "loss": 0.041,
1418
+ "step": 97500
1419
+ },
1420
+ {
1421
+ "epoch": 4.559411928910394,
1422
+ "grad_norm": 2.108750820159912,
1423
+ "learning_rate": 1.7623522843584256e-06,
1424
+ "loss": 0.0406,
1425
+ "step": 98000
1426
+ },
1427
+ {
1428
+ "epoch": 4.58267423467014,
1429
+ "grad_norm": 1.38349449634552,
1430
+ "learning_rate": 1.669303061319438e-06,
1431
+ "loss": 0.0403,
1432
+ "step": 98500
1433
+ },
1434
+ {
1435
+ "epoch": 4.605936540429887,
1436
+ "grad_norm": 1.092578411102295,
1437
+ "learning_rate": 1.5762538382804504e-06,
1438
+ "loss": 0.0419,
1439
+ "step": 99000
1440
+ },
1441
+ {
1442
+ "epoch": 4.6291988461896345,
1443
+ "grad_norm": 2.6619553565979004,
1444
+ "learning_rate": 1.4832046152414628e-06,
1445
+ "loss": 0.0394,
1446
+ "step": 99500
1447
+ },
1448
+ {
1449
+ "epoch": 4.652461151949382,
1450
+ "grad_norm": 1.8110424280166626,
1451
+ "learning_rate": 1.3901553922024752e-06,
1452
+ "loss": 0.0407,
1453
+ "step": 100000
1454
+ },
1455
+ {
1456
+ "epoch": 4.675723457709128,
1457
+ "grad_norm": 1.2239103317260742,
1458
+ "learning_rate": 1.2971061691634876e-06,
1459
+ "loss": 0.0415,
1460
+ "step": 100500
1461
+ },
1462
+ {
1463
+ "epoch": 4.698985763468875,
1464
+ "grad_norm": 3.0208041667938232,
1465
+ "learning_rate": 1.2040569461245e-06,
1466
+ "loss": 0.0411,
1467
+ "step": 101000
1468
+ },
1469
+ {
1470
+ "epoch": 4.7222480692286215,
1471
+ "grad_norm": 1.5058140754699707,
1472
+ "learning_rate": 1.1110077230855124e-06,
1473
+ "loss": 0.0431,
1474
+ "step": 101500
1475
+ },
1476
+ {
1477
+ "epoch": 4.745510374988369,
1478
+ "grad_norm": 1.6732498407363892,
1479
+ "learning_rate": 1.0179585000465248e-06,
1480
+ "loss": 0.0408,
1481
+ "step": 102000
1482
+ },
1483
+ {
1484
+ "epoch": 4.768772680748116,
1485
+ "grad_norm": 2.15928053855896,
1486
+ "learning_rate": 9.249092770075371e-07,
1487
+ "loss": 0.0412,
1488
+ "step": 102500
1489
+ },
1490
+ {
1491
+ "epoch": 4.792034986507863,
1492
+ "grad_norm": 1.8211805820465088,
1493
+ "learning_rate": 8.318600539685494e-07,
1494
+ "loss": 0.0418,
1495
+ "step": 103000
1496
+ },
1497
+ {
1498
+ "epoch": 4.815297292267609,
1499
+ "grad_norm": 1.1392755508422852,
1500
+ "learning_rate": 7.388108309295617e-07,
1501
+ "loss": 0.0395,
1502
+ "step": 103500
1503
+ },
1504
+ {
1505
+ "epoch": 4.838559598027357,
1506
+ "grad_norm": 1.2640013694763184,
1507
+ "learning_rate": 6.457616078905741e-07,
1508
+ "loss": 0.0404,
1509
+ "step": 104000
1510
+ },
1511
+ {
1512
+ "epoch": 4.861821903787103,
1513
+ "grad_norm": 1.2413549423217773,
1514
+ "learning_rate": 5.527123848515865e-07,
1515
+ "loss": 0.0409,
1516
+ "step": 104500
1517
+ },
1518
+ {
1519
+ "epoch": 4.88508420954685,
1520
+ "grad_norm": 0.14875428378582,
1521
+ "learning_rate": 4.5966316181259895e-07,
1522
+ "loss": 0.0405,
1523
+ "step": 105000
1524
+ },
1525
+ {
1526
+ "epoch": 4.908346515306597,
1527
+ "grad_norm": 1.309793472290039,
1528
+ "learning_rate": 3.6661393877361125e-07,
1529
+ "loss": 0.041,
1530
+ "step": 105500
1531
+ },
1532
+ {
1533
+ "epoch": 4.9316088210663445,
1534
+ "grad_norm": 0.9020711779594421,
1535
+ "learning_rate": 2.7356471573462365e-07,
1536
+ "loss": 0.0402,
1537
+ "step": 106000
1538
+ },
1539
+ {
1540
+ "epoch": 4.954871126826091,
1541
+ "grad_norm": 1.875877857208252,
1542
+ "learning_rate": 1.80515492695636e-07,
1543
+ "loss": 0.0408,
1544
+ "step": 106500
1545
+ },
1546
+ {
1547
+ "epoch": 4.978133432585838,
1548
+ "grad_norm": 3.464327335357666,
1549
+ "learning_rate": 8.746626965664838e-08,
1550
+ "loss": 0.0407,
1551
+ "step": 107000
1552
+ },
1553
+ {
1554
+ "epoch": 5.0,
1555
+ "eval_accuracy": 0.5647157346236159,
1556
+ "eval_f1": 0.7594581273430615,
1557
+ "eval_loss": 0.1101851612329483,
1558
+ "eval_roc_auc": 0.8742792491333411,
1559
+ "eval_runtime": 22.5758,
1560
+ "eval_samples_per_second": 952.08,
1561
+ "eval_steps_per_second": 119.021,
1562
+ "step": 107470
1563
+ }
1564
+ ],
1565
+ "logging_steps": 500,
1566
+ "max_steps": 107470,
1567
+ "num_input_tokens_seen": 0,
1568
+ "num_train_epochs": 5,
1569
+ "save_steps": 500,
1570
+ "stateful_callbacks": {
1571
+ "TrainerControl": {
1572
+ "args": {
1573
+ "should_epoch_stop": false,
1574
+ "should_evaluate": false,
1575
+ "should_log": false,
1576
+ "should_save": true,
1577
+ "should_training_stop": true
1578
+ },
1579
+ "attributes": {}
1580
+ }
1581
+ },
1582
+ "total_flos": 5.656124322596352e+16,
1583
+ "train_batch_size": 8,
1584
+ "trial_name": null,
1585
+ "trial_params": null
1586
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f09a088fc659036909375538a041709b83ecfa3173e60c7388d9eccad28cbfbe
3
+ size 5176
vocab.txt ADDED
The diff for this file is too large to render. See raw diff