ZhangYunchenY committed
Commit fbaf927 · 1 Parent(s): 813ac85

[Model] roberta-base-stsb
config.json ADDED
@@ -0,0 +1,35 @@
+ {
+   "_name_or_path": "/mnt/lustre/zhangyunchen/transformers/roberta-base",
+   "architectures": [
+     "RobertaForSequenceClassification"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "bos_token_id": 0,
+   "classifier_dropout": null,
+   "eos_token_id": 2,
+   "finetuning_task": "stsb",
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 768,
+   "id2label": {
+     "0": "LABEL_0"
+   },
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "label2id": {
+     "LABEL_0": 0
+   },
+   "layer_norm_eps": 1e-05,
+   "max_position_embeddings": 514,
+   "model_type": "roberta",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "pad_token_id": 1,
+   "position_embedding_type": "absolute",
+   "problem_type": "regression",
+   "torch_dtype": "float32",
+   "transformers_version": "4.16.2",
+   "type_vocab_size": 1,
+   "use_cache": true,
+   "vocab_size": 50265
+ }
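With "architectures" set to RobertaForSequenceClassification, a single LABEL_0 entry, and "problem_type": "regression", this config makes the classification head emit one scalar per sentence pair, i.e. an STS-B similarity score. A minimal usage sketch (not part of this commit; the Hub repo id below is an assumption, and any local path to this checkpoint works the same way):

from transformers import AutoModelForSequenceClassification, AutoTokenizer

repo = "ZhangYunchenY/roberta-base-stsb"  # hypothetical repo id; a local checkpoint path also works
tokenizer = AutoTokenizer.from_pretrained(repo)
model = AutoModelForSequenceClassification.from_pretrained(repo)

# Encode a sentence pair and read off the single regression logit.
inputs = tokenizer("A man is playing a guitar.",
                   "A person plays a guitar.",
                   return_tensors="pt")
score = model(**inputs).logits.item()
print(score)  # similarity score; STS-B gold labels lie on a 0-5 scale
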
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:50a8a493a7d39eb568f9b195a446a70c364e5af1a95ae6b1f642d6ff28808524
+ size 997298211
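The three lines above are not the optimizer state itself but a Git LFS pointer: "version" names the pointer spec, "oid" is the SHA-256 of the real blob, and "size" is its byte count (here ~997 MB, roughly twice pytorch_model.bin, as expected for AdamW's two fp32 moment buffers per parameter). A minimal sketch for reading such a pointer (an illustration of the spec, not a tool used by this repo):

# Parse a Git LFS pointer file into its key/value fields.
def parse_lfs_pointer(path: str) -> dict:
    fields = {}
    with open(path) as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            fields[key] = value
    return fields

ptr = parse_lfs_pointer("optimizer.pt")
print(ptr["oid"], ptr["size"])  # sha256:50a8a4... 997298211
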
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d53ad7964c0be3d02151eed8eee39491de64f6ba9f46a35685cda0c150160597
+ size 498676627
rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:dba11b9dc46ca4213c43bd2be071fbcd910ded8a0373affa5542585650081e02
+ size 14659
scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d03ed95b8fbea10e20c6fbe6b4d37782180f1f48fec05fcebf936fd45bf5a1a6
+ size 623
special_tokens_map.json ADDED
@@ -0,0 +1 @@
+ {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "sep_token": "</s>", "pad_token": "<pad>", "cls_token": "<s>", "mask_token": {"content": "<mask>", "single_word": false, "lstrip": true, "rstrip": false, "normalized": false}}
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1 @@
+ {"unk_token": "<unk>", "bos_token": "<s>", "eos_token": "</s>", "add_prefix_space": false, "errors": "replace", "sep_token": "</s>", "cls_token": "<s>", "pad_token": "<pad>", "mask_token": "<mask>", "trim_offsets": true, "special_tokens_map_file": null, "name_or_path": "/mnt/lustre/zhangyunchen/transformers/roberta-base", "tokenizer_class": "RobertaTokenizer"}
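Both tokenizer files point back at the same roberta-base checkpoint. RoBERTa reuses <s>/</s> as its cls/sep tokens and has no segment embeddings (note "type_vocab_size": 1 in config.json), so a sentence pair is encoded as <s> A </s></s> B </s>. A small sketch (loading from the public roberta-base tokenizer, which this config mirrors):

from transformers import RobertaTokenizer

tok = RobertaTokenizer.from_pretrained("roberta-base")
ids = tok("A man is playing a guitar.", "A person plays a guitar.")["input_ids"]
print(tok.decode(ids))
# <s>A man is playing a guitar.</s></s>A person plays a guitar.</s>
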
trainer_state.json ADDED
@@ -0,0 +1,288 @@
+ {
+   "best_metric": 0.9103662307301734,
+   "best_model_checkpoint": "./fp32_3e_5/models/stsb-roberta-base/checkpoint-1600",
+   "epoch": 8.88888888888889,
+   "global_step": 1600,
+   "is_hyper_param_search": false,
+   "is_local_process_zero": true,
+   "is_world_process_zero": true,
+   "log_history": [
+     {
+       "epoch": 0.56,
+       "learning_rate": 2.777777777777778e-05,
+       "loss": 3.7756,
+       "step": 100
+     },
+     {
+       "epoch": 0.56,
+       "eval_combined_score": 0.8535264998873677,
+       "eval_loss": 0.6678845286369324,
+       "eval_pearson": 0.852268663307789,
+       "eval_runtime": 7.0932,
+       "eval_samples_per_second": 211.47,
+       "eval_spearmanr": 0.8547843364669463,
+       "eval_steps_per_second": 6.626,
+       "step": 100
+     },
+     {
+       "epoch": 1.11,
+       "learning_rate": 2.8368794326241135e-05,
+       "loss": 0.6723,
+       "step": 200
+     },
+     {
+       "epoch": 1.11,
+       "eval_combined_score": 0.8880544841766463,
+       "eval_loss": 0.5207411050796509,
+       "eval_pearson": 0.888720755423408,
+       "eval_runtime": 6.6011,
+       "eval_samples_per_second": 227.235,
+       "eval_spearmanr": 0.8873882129298847,
+       "eval_steps_per_second": 7.12,
+       "step": 200
+     },
+     {
+       "epoch": 1.67,
+       "learning_rate": 2.6595744680851064e-05,
+       "loss": 0.5148,
+       "step": 300
+     },
+     {
+       "epoch": 1.67,
+       "eval_combined_score": 0.8963770244622205,
+       "eval_loss": 0.4456341564655304,
+       "eval_pearson": 0.8974699069640754,
+       "eval_runtime": 4.0841,
+       "eval_samples_per_second": 367.278,
+       "eval_spearmanr": 0.8952841419603657,
+       "eval_steps_per_second": 11.508,
+       "step": 300
+     },
+     {
+       "epoch": 2.22,
+       "learning_rate": 2.4822695035460992e-05,
+       "loss": 0.3898,
+       "step": 400
+     },
+     {
+       "epoch": 2.22,
+       "eval_combined_score": 0.900132405880169,
+       "eval_loss": 0.44346898794174194,
+       "eval_pearson": 0.9017121586504461,
+       "eval_runtime": 3.8243,
+       "eval_samples_per_second": 392.231,
+       "eval_spearmanr": 0.898552653109892,
+       "eval_steps_per_second": 12.29,
+       "step": 400
+     },
+     {
+       "epoch": 2.78,
+       "learning_rate": 2.3049645390070924e-05,
+       "loss": 0.3167,
+       "step": 500
+     },
+     {
+       "epoch": 2.78,
+       "eval_combined_score": 0.9007290034038564,
+       "eval_loss": 0.48722508549690247,
+       "eval_pearson": 0.9027466027895519,
+       "eval_runtime": 3.7737,
+       "eval_samples_per_second": 397.491,
+       "eval_spearmanr": 0.898711404018161,
+       "eval_steps_per_second": 12.455,
+       "step": 500
+     },
+     {
+       "epoch": 3.33,
+       "learning_rate": 2.1276595744680852e-05,
+       "loss": 0.253,
+       "step": 600
+     },
+     {
+       "epoch": 3.33,
+       "eval_combined_score": 0.9052751739076982,
+       "eval_loss": 0.4403521716594696,
+       "eval_pearson": 0.907271329521877,
+       "eval_runtime": 5.4187,
+       "eval_samples_per_second": 276.821,
+       "eval_spearmanr": 0.9032790182935194,
+       "eval_steps_per_second": 8.674,
+       "step": 600
+     },
+     {
+       "epoch": 3.89,
+       "learning_rate": 1.950354609929078e-05,
+       "loss": 0.2197,
+       "step": 700
+     },
+     {
+       "epoch": 3.89,
+       "eval_combined_score": 0.9059989545154765,
+       "eval_loss": 0.40818288922309875,
+       "eval_pearson": 0.9078220385222855,
+       "eval_runtime": 3.7738,
+       "eval_samples_per_second": 397.48,
+       "eval_spearmanr": 0.9041758705086674,
+       "eval_steps_per_second": 12.454,
+       "step": 700
+     },
+     {
+       "epoch": 4.44,
+       "learning_rate": 1.773049645390071e-05,
+       "loss": 0.1904,
+       "step": 800
+     },
+     {
+       "epoch": 4.44,
+       "eval_combined_score": 0.9059072653893803,
+       "eval_loss": 0.41569381952285767,
+       "eval_pearson": 0.9063948213345204,
+       "eval_runtime": 4.1987,
+       "eval_samples_per_second": 357.256,
+       "eval_spearmanr": 0.9054197094442402,
+       "eval_steps_per_second": 11.194,
+       "step": 800
+     },
+     {
+       "epoch": 5.0,
+       "learning_rate": 1.5957446808510637e-05,
+       "loss": 0.1708,
+       "step": 900
+     },
+     {
+       "epoch": 5.0,
+       "eval_combined_score": 0.906494497393801,
+       "eval_loss": 0.46862491965293884,
+       "eval_pearson": 0.9083108072067478,
+       "eval_runtime": 4.0762,
+       "eval_samples_per_second": 367.992,
+       "eval_spearmanr": 0.904678187580854,
+       "eval_steps_per_second": 11.53,
+       "step": 900
+     },
+     {
+       "epoch": 5.56,
+       "learning_rate": 1.4184397163120568e-05,
+       "loss": 0.1363,
+       "step": 1000
+     },
+     {
+       "epoch": 5.56,
+       "eval_combined_score": 0.9055447457890553,
+       "eval_loss": 0.4190002977848053,
+       "eval_pearson": 0.9069385161043947,
+       "eval_runtime": 3.7515,
+       "eval_samples_per_second": 399.845,
+       "eval_spearmanr": 0.9041509754737158,
+       "eval_steps_per_second": 12.528,
+       "step": 1000
+     },
+     {
+       "epoch": 6.11,
+       "learning_rate": 1.2411347517730496e-05,
+       "loss": 0.1291,
+       "step": 1100
+     },
+     {
+       "epoch": 6.11,
+       "eval_combined_score": 0.9081783832953019,
+       "eval_loss": 0.4186760187149048,
+       "eval_pearson": 0.9095907875352112,
+       "eval_runtime": 4.6081,
+       "eval_samples_per_second": 325.513,
+       "eval_spearmanr": 0.9067659790553926,
+       "eval_steps_per_second": 10.199,
+       "step": 1100
+     },
+     {
+       "epoch": 6.67,
+       "learning_rate": 1.0638297872340426e-05,
+       "loss": 0.1102,
+       "step": 1200
+     },
+     {
+       "epoch": 6.67,
+       "eval_combined_score": 0.907499747827053,
+       "eval_loss": 0.4207873046398163,
+       "eval_pearson": 0.9091866720599195,
+       "eval_runtime": 4.3818,
+       "eval_samples_per_second": 342.325,
+       "eval_spearmanr": 0.9058128235941866,
+       "eval_steps_per_second": 10.726,
+       "step": 1200
+     },
+     {
+       "epoch": 7.22,
+       "learning_rate": 8.865248226950355e-06,
+       "loss": 0.0984,
+       "step": 1300
+     },
+     {
+       "epoch": 7.22,
+       "eval_combined_score": 0.906452303345736,
+       "eval_loss": 0.4338637590408325,
+       "eval_pearson": 0.907816988223447,
+       "eval_runtime": 3.7556,
+       "eval_samples_per_second": 399.402,
+       "eval_spearmanr": 0.905087618468025,
+       "eval_steps_per_second": 12.515,
+       "step": 1300
+     },
+     {
+       "epoch": 7.78,
+       "learning_rate": 7.092198581560284e-06,
+       "loss": 0.092,
+       "step": 1400
+     },
+     {
+       "epoch": 7.78,
+       "eval_combined_score": 0.9068694360771773,
+       "eval_loss": 0.418102502822876,
+       "eval_pearson": 0.9085830258812985,
+       "eval_runtime": 4.0479,
+       "eval_samples_per_second": 370.565,
+       "eval_spearmanr": 0.9051558462730561,
+       "eval_steps_per_second": 11.611,
+       "step": 1400
+     },
+     {
+       "epoch": 8.33,
+       "learning_rate": 5.319148936170213e-06,
+       "loss": 0.0834,
+       "step": 1500
+     },
+     {
+       "epoch": 8.33,
+       "eval_combined_score": 0.9068391544864225,
+       "eval_loss": 0.42239323258399963,
+       "eval_pearson": 0.9082395585902145,
+       "eval_runtime": 4.2304,
+       "eval_samples_per_second": 354.575,
+       "eval_spearmanr": 0.9054387503826306,
+       "eval_steps_per_second": 11.11,
+       "step": 1500
+     },
+     {
+       "epoch": 8.89,
+       "learning_rate": 3.546099290780142e-06,
+       "loss": 0.0798,
+       "step": 1600
+     },
+     {
+       "epoch": 8.89,
+       "eval_combined_score": 0.9087789493455298,
+       "eval_loss": 0.3950077295303345,
+       "eval_pearson": 0.9103662307301734,
+       "eval_runtime": 3.8353,
+       "eval_samples_per_second": 391.105,
+       "eval_spearmanr": 0.9071916679608862,
+       "eval_steps_per_second": 12.255,
+       "step": 1600
+     }
+   ],
+   "max_steps": 1800,
+   "num_train_epochs": 10,
+   "total_flos": 3362002879051776.0,
+   "trial_name": null,
+   "trial_params": null
+ }
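Two things in this state file are easy to verify. First, "eval_combined_score" is the arithmetic mean of Pearson and Spearman correlation, the standard GLUE aggregation for STS-B: at step 100, (0.852268663307789 + 0.8547843364669463) / 2 = 0.8535264998873677. Second, "best_metric" (0.9103662307301734) equals "eval_pearson" at step 1600, so Pearson was evidently the model-selection metric; the step/epoch ratio (1600 / 8.888... = 180 steps per epoch) also matches max_steps 1800 over 10 epochs. A minimal check (assuming trainer_state.json is in the working directory):

import json

# Load the trainer state and confirm the combined score is the
# mean of Pearson and Spearman for every evaluation entry.
with open("trainer_state.json") as f:
    state = json.load(f)

for entry in state["log_history"]:
    if "eval_combined_score" in entry:
        mean = (entry["eval_pearson"] + entry["eval_spearmanr"]) / 2
        assert abs(mean - entry["eval_combined_score"]) < 1e-9
        print(entry["step"], round(entry["eval_combined_score"], 4))
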
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:97367c6cd3e15f25c0f4cd81a95b79ed29b32469d7c3c3f416ee6cfd5ef048f0
+ size 3055
vocab.json ADDED
The diff for this file is too large to render. See raw diff