eneSadi commited on
Commit
155b689
·
verified ·
1 Parent(s): f768ee0

Upload 15 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
config.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "xlm-roberta-large",
3
+ "architectures": [
4
+ "XLMRobertaModel"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "bos_token_id": 0,
8
+ "classifier_dropout": null,
9
+ "eos_token_id": 2,
10
+ "hidden_act": "gelu",
11
+ "hidden_dropout_prob": 0.1,
12
+ "hidden_size": 1024,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 4096,
15
+ "layer_norm_eps": 1e-05,
16
+ "max_position_embeddings": 514,
17
+ "model_type": "xlm-roberta",
18
+ "num_attention_heads": 16,
19
+ "num_hidden_layers": 24,
20
+ "output_past": true,
21
+ "pad_token_id": 1,
22
+ "position_embedding_type": "absolute",
23
+ "torch_dtype": "float32",
24
+ "transformers_version": "4.46.3",
25
+ "type_vocab_size": 1,
26
+ "use_cache": true,
27
+ "vocab_size": 250002
28
+ }
config_sentence_transformers.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "__version__": {
3
+ "sentence_transformers": "3.3.1",
4
+ "transformers": "4.46.3",
5
+ "pytorch": "2.5.1+cu121"
6
+ },
7
+ "prompts": {},
8
+ "default_prompt_name": null,
9
+ "similarity_fn_name": "cosine"
10
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f29e5eff32af3a7475105b07bb3ab35f91bc5f26b396e6c0054ae215a2d80934
3
+ size 2239607176
modules.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "idx": 0,
4
+ "name": "0",
5
+ "path": "",
6
+ "type": "sentence_transformers.models.Transformer"
7
+ },
8
+ {
9
+ "idx": 1,
10
+ "name": "1",
11
+ "path": "1_Pooling",
12
+ "type": "sentence_transformers.models.Pooling"
13
+ }
14
+ ]
rng_state_0.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:64913fc7cf76e57473f1f3dc7fcb428bfff02ced7fac9575421f95b18356f17a
3
+ size 14960
rng_state_1.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9f9b522c44df741b94051d80458d3b917419d0a5b17ffb0d72bc5ecd64b46af5
3
+ size 15024
rng_state_2.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a7a2eea9880115fd39e7d18dd8cfc56a010ca6c0afae3228c7b5338323b2f33d
3
+ size 15024
rng_state_3.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6c67727acaaaca27d55b3d840273b4d3cff0c9ff7202c37ecf11f259482d03bc
3
+ size 15024
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3642978a9acbeda084134473869c983a3152a357cc28cd346ff0a5d3373e6435
3
+ size 1064
sentence_bert_config.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "max_seq_length": 512,
3
+ "do_lower_case": false
4
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "cls_token": "<s>",
4
+ "eos_token": "</s>",
5
+ "mask_token": {
6
+ "content": "<mask>",
7
+ "lstrip": true,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false
11
+ },
12
+ "pad_token": "<pad>",
13
+ "sep_token": "</s>",
14
+ "unk_token": "<unk>"
15
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:883b037111086fd4dfebbbc9b7cee11e1517b5e0c0514879478661440f137085
3
+ size 17082987
tokenizer_config.json ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "<s>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "<pad>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "</s>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "<unk>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "250001": {
36
+ "content": "<mask>",
37
+ "lstrip": true,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "bos_token": "<s>",
45
+ "clean_up_tokenization_spaces": false,
46
+ "cls_token": "<s>",
47
+ "eos_token": "</s>",
48
+ "mask_token": "<mask>",
49
+ "model_max_length": 512,
50
+ "pad_token": "<pad>",
51
+ "sep_token": "</s>",
52
+ "tokenizer_class": "XLMRobertaTokenizer",
53
+ "unk_token": "<unk>"
54
+ }
trainer_state.json ADDED
@@ -0,0 +1,808 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 2.0,
5
+ "eval_steps": 600,
6
+ "global_step": 19724,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.03041979314540661,
13
+ "grad_norm": 18.901899337768555,
14
+ "learning_rate": 7.602635580334516e-06,
15
+ "loss": 6.9711,
16
+ "step": 300
17
+ },
18
+ {
19
+ "epoch": 0.06083958629081322,
20
+ "grad_norm": 10.027973175048828,
21
+ "learning_rate": 1.5205271160669032e-05,
22
+ "loss": 6.6782,
23
+ "step": 600
24
+ },
25
+ {
26
+ "epoch": 0.06083958629081322,
27
+ "eval_loss": 6.566777229309082,
28
+ "eval_runtime": 137.8826,
29
+ "eval_samples_per_second": 722.803,
30
+ "eval_steps_per_second": 3.771,
31
+ "eval_xlm-roberta-large-msmarco-eval_pearson_cosine": 0.8221966547977905,
32
+ "eval_xlm-roberta-large-msmarco-eval_spearman_cosine": 0.8553459588247386,
33
+ "step": 600
34
+ },
35
+ {
36
+ "epoch": 0.09125937943621984,
37
+ "grad_norm": 7.943920612335205,
38
+ "learning_rate": 2.2807906741003547e-05,
39
+ "loss": 6.5629,
40
+ "step": 900
41
+ },
42
+ {
43
+ "epoch": 0.12167917258162644,
44
+ "grad_norm": 14.02817440032959,
45
+ "learning_rate": 3.0410542321338064e-05,
46
+ "loss": 6.5257,
47
+ "step": 1200
48
+ },
49
+ {
50
+ "epoch": 0.12167917258162644,
51
+ "eval_loss": 6.480606555938721,
52
+ "eval_runtime": 137.5885,
53
+ "eval_samples_per_second": 724.348,
54
+ "eval_steps_per_second": 3.779,
55
+ "eval_xlm-roberta-large-msmarco-eval_pearson_cosine": 0.8389107234301584,
56
+ "eval_xlm-roberta-large-msmarco-eval_spearman_cosine": 0.8806960493934014,
57
+ "step": 1200
58
+ },
59
+ {
60
+ "epoch": 0.15209896572703305,
61
+ "grad_norm": 6.915318489074707,
62
+ "learning_rate": 3.8013177901672584e-05,
63
+ "loss": 6.5118,
64
+ "step": 1500
65
+ },
66
+ {
67
+ "epoch": 0.18251875887243968,
68
+ "grad_norm": 8.075170516967773,
69
+ "learning_rate": 4.5615813482007094e-05,
70
+ "loss": 6.5113,
71
+ "step": 1800
72
+ },
73
+ {
74
+ "epoch": 0.18251875887243968,
75
+ "eval_loss": 6.4504618644714355,
76
+ "eval_runtime": 137.6975,
77
+ "eval_samples_per_second": 723.775,
78
+ "eval_steps_per_second": 3.776,
79
+ "eval_xlm-roberta-large-msmarco-eval_pearson_cosine": 0.8409337903166615,
80
+ "eval_xlm-roberta-large-msmarco-eval_spearman_cosine": 0.8860494474021995,
81
+ "step": 1800
82
+ },
83
+ {
84
+ "epoch": 0.21293855201784628,
85
+ "grad_norm": 7.645256996154785,
86
+ "learning_rate": 4.9642273674722556e-05,
87
+ "loss": 6.4985,
88
+ "step": 2100
89
+ },
90
+ {
91
+ "epoch": 0.2433583451632529,
92
+ "grad_norm": 9.04224681854248,
93
+ "learning_rate": 4.879725085910653e-05,
94
+ "loss": 6.4647,
95
+ "step": 2400
96
+ },
97
+ {
98
+ "epoch": 0.2433583451632529,
99
+ "eval_loss": 6.431312084197998,
100
+ "eval_runtime": 138.1086,
101
+ "eval_samples_per_second": 721.621,
102
+ "eval_steps_per_second": 3.765,
103
+ "eval_xlm-roberta-large-msmarco-eval_pearson_cosine": 0.8624197414681364,
104
+ "eval_xlm-roberta-large-msmarco-eval_spearman_cosine": 0.8930269904775303,
105
+ "step": 2400
106
+ },
107
+ {
108
+ "epoch": 0.2737781383086595,
109
+ "grad_norm": 8.209064483642578,
110
+ "learning_rate": 4.795222804349051e-05,
111
+ "loss": 6.4521,
112
+ "step": 2700
113
+ },
114
+ {
115
+ "epoch": 0.3041979314540661,
116
+ "grad_norm": 9.494904518127441,
117
+ "learning_rate": 4.710720522787449e-05,
118
+ "loss": 6.433,
119
+ "step": 3000
120
+ },
121
+ {
122
+ "epoch": 0.3041979314540661,
123
+ "eval_loss": 6.395111083984375,
124
+ "eval_runtime": 137.451,
125
+ "eval_samples_per_second": 725.073,
126
+ "eval_steps_per_second": 3.783,
127
+ "eval_xlm-roberta-large-msmarco-eval_pearson_cosine": 0.8646640222074062,
128
+ "eval_xlm-roberta-large-msmarco-eval_spearman_cosine": 0.9000406008439981,
129
+ "step": 3000
130
+ },
131
+ {
132
+ "epoch": 0.3346177245994727,
133
+ "grad_norm": 47.823909759521484,
134
+ "learning_rate": 4.626218241225847e-05,
135
+ "loss": 6.425,
136
+ "step": 3300
137
+ },
138
+ {
139
+ "epoch": 0.36503751774487936,
140
+ "grad_norm": 7.535511493682861,
141
+ "learning_rate": 4.541715959664244e-05,
142
+ "loss": 6.4997,
143
+ "step": 3600
144
+ },
145
+ {
146
+ "epoch": 0.36503751774487936,
147
+ "eval_loss": 6.358321666717529,
148
+ "eval_runtime": 137.1665,
149
+ "eval_samples_per_second": 726.577,
150
+ "eval_steps_per_second": 3.791,
151
+ "eval_xlm-roberta-large-msmarco-eval_pearson_cosine": 0.8730759198834104,
152
+ "eval_xlm-roberta-large-msmarco-eval_spearman_cosine": 0.9086688126662049,
153
+ "step": 3600
154
+ },
155
+ {
156
+ "epoch": 0.39545731089028596,
157
+ "grad_norm": 10.910676956176758,
158
+ "learning_rate": 4.4572136781026426e-05,
159
+ "loss": 6.4025,
160
+ "step": 3900
161
+ },
162
+ {
163
+ "epoch": 0.42587710403569257,
164
+ "grad_norm": 10.022330284118652,
165
+ "learning_rate": 4.37271139654104e-05,
166
+ "loss": 6.3871,
167
+ "step": 4200
168
+ },
169
+ {
170
+ "epoch": 0.42587710403569257,
171
+ "eval_loss": 6.344127655029297,
172
+ "eval_runtime": 138.1255,
173
+ "eval_samples_per_second": 721.532,
174
+ "eval_steps_per_second": 3.765,
175
+ "eval_xlm-roberta-large-msmarco-eval_pearson_cosine": 0.8825601723202625,
176
+ "eval_xlm-roberta-large-msmarco-eval_spearman_cosine": 0.9130197347529897,
177
+ "step": 4200
178
+ },
179
+ {
180
+ "epoch": 0.45629689718109917,
181
+ "grad_norm": 9.585289001464844,
182
+ "learning_rate": 4.288209114979438e-05,
183
+ "loss": 6.3839,
184
+ "step": 4500
185
+ },
186
+ {
187
+ "epoch": 0.4867166903265058,
188
+ "grad_norm": 10.677062034606934,
189
+ "learning_rate": 4.203706833417836e-05,
190
+ "loss": 6.3572,
191
+ "step": 4800
192
+ },
193
+ {
194
+ "epoch": 0.4867166903265058,
195
+ "eval_loss": 6.322529315948486,
196
+ "eval_runtime": 138.9975,
197
+ "eval_samples_per_second": 717.006,
198
+ "eval_steps_per_second": 3.741,
199
+ "eval_xlm-roberta-large-msmarco-eval_pearson_cosine": 0.8822280482311033,
200
+ "eval_xlm-roberta-large-msmarco-eval_spearman_cosine": 0.9164128582462162,
201
+ "step": 4800
202
+ },
203
+ {
204
+ "epoch": 0.5171364834719124,
205
+ "grad_norm": 7.5291948318481445,
206
+ "learning_rate": 4.1192045518562337e-05,
207
+ "loss": 6.3551,
208
+ "step": 5100
209
+ },
210
+ {
211
+ "epoch": 0.547556276617319,
212
+ "grad_norm": 110.76056671142578,
213
+ "learning_rate": 4.034702270294631e-05,
214
+ "loss": 6.3389,
215
+ "step": 5400
216
+ },
217
+ {
218
+ "epoch": 0.547556276617319,
219
+ "eval_loss": 6.453874111175537,
220
+ "eval_runtime": 138.6084,
221
+ "eval_samples_per_second": 719.018,
222
+ "eval_steps_per_second": 3.752,
223
+ "eval_xlm-roberta-large-msmarco-eval_pearson_cosine": 0.8527196112195143,
224
+ "eval_xlm-roberta-large-msmarco-eval_spearman_cosine": 0.8865816143700719,
225
+ "step": 5400
226
+ },
227
+ {
228
+ "epoch": 0.5779760697627256,
229
+ "grad_norm": 7.72743034362793,
230
+ "learning_rate": 3.9501999887330295e-05,
231
+ "loss": 6.342,
232
+ "step": 5700
233
+ },
234
+ {
235
+ "epoch": 0.6083958629081322,
236
+ "grad_norm": 7.7697434425354,
237
+ "learning_rate": 3.865697707171427e-05,
238
+ "loss": 6.3264,
239
+ "step": 6000
240
+ },
241
+ {
242
+ "epoch": 0.6083958629081322,
243
+ "eval_loss": 6.28953218460083,
244
+ "eval_runtime": 138.8082,
245
+ "eval_samples_per_second": 717.984,
246
+ "eval_steps_per_second": 3.746,
247
+ "eval_xlm-roberta-large-msmarco-eval_pearson_cosine": 0.886832364356628,
248
+ "eval_xlm-roberta-large-msmarco-eval_spearman_cosine": 0.9225845620423561,
249
+ "step": 6000
250
+ },
251
+ {
252
+ "epoch": 0.6388156560535388,
253
+ "grad_norm": 10.513375282287598,
254
+ "learning_rate": 3.781195425609825e-05,
255
+ "loss": 6.3174,
256
+ "step": 6300
257
+ },
258
+ {
259
+ "epoch": 0.6692354491989454,
260
+ "grad_norm": 8.724417686462402,
261
+ "learning_rate": 3.696693144048223e-05,
262
+ "loss": 6.3053,
263
+ "step": 6600
264
+ },
265
+ {
266
+ "epoch": 0.6692354491989454,
267
+ "eval_loss": 6.261702060699463,
268
+ "eval_runtime": 138.9377,
269
+ "eval_samples_per_second": 717.314,
270
+ "eval_steps_per_second": 3.743,
271
+ "eval_xlm-roberta-large-msmarco-eval_pearson_cosine": 0.8988111510184248,
272
+ "eval_xlm-roberta-large-msmarco-eval_spearman_cosine": 0.9276381690737614,
273
+ "step": 6600
274
+ },
275
+ {
276
+ "epoch": 0.6996552423443521,
277
+ "grad_norm": 7.966050148010254,
278
+ "learning_rate": 3.6121908624866206e-05,
279
+ "loss": 6.2845,
280
+ "step": 6900
281
+ },
282
+ {
283
+ "epoch": 0.7300750354897587,
284
+ "grad_norm": 9.207416534423828,
285
+ "learning_rate": 3.527688580925019e-05,
286
+ "loss": 6.2682,
287
+ "step": 7200
288
+ },
289
+ {
290
+ "epoch": 0.7300750354897587,
291
+ "eval_loss": 6.230149745941162,
292
+ "eval_runtime": 138.8391,
293
+ "eval_samples_per_second": 717.824,
294
+ "eval_steps_per_second": 3.745,
295
+ "eval_xlm-roberta-large-msmarco-eval_pearson_cosine": 0.9008172441807218,
296
+ "eval_xlm-roberta-large-msmarco-eval_spearman_cosine": 0.931930159250373,
297
+ "step": 7200
298
+ },
299
+ {
300
+ "epoch": 0.7604948286351653,
301
+ "grad_norm": 9.360489845275879,
302
+ "learning_rate": 3.4431862993634165e-05,
303
+ "loss": 6.2636,
304
+ "step": 7500
305
+ },
306
+ {
307
+ "epoch": 0.7909146217805719,
308
+ "grad_norm": 7.827134609222412,
309
+ "learning_rate": 3.358684017801814e-05,
310
+ "loss": 6.2583,
311
+ "step": 7800
312
+ },
313
+ {
314
+ "epoch": 0.7909146217805719,
315
+ "eval_loss": 6.21486234664917,
316
+ "eval_runtime": 138.9559,
317
+ "eval_samples_per_second": 717.22,
318
+ "eval_steps_per_second": 3.742,
319
+ "eval_xlm-roberta-large-msmarco-eval_pearson_cosine": 0.9026087567445202,
320
+ "eval_xlm-roberta-large-msmarco-eval_spearman_cosine": 0.9353704768738569,
321
+ "step": 7800
322
+ },
323
+ {
324
+ "epoch": 0.8213344149259785,
325
+ "grad_norm": 30.614959716796875,
326
+ "learning_rate": 3.2741817362402124e-05,
327
+ "loss": 6.2444,
328
+ "step": 8100
329
+ },
330
+ {
331
+ "epoch": 0.8517542080713851,
332
+ "grad_norm": 31.50200843811035,
333
+ "learning_rate": 3.18967945467861e-05,
334
+ "loss": 6.2486,
335
+ "step": 8400
336
+ },
337
+ {
338
+ "epoch": 0.8517542080713851,
339
+ "eval_loss": 6.202221870422363,
340
+ "eval_runtime": 138.5186,
341
+ "eval_samples_per_second": 719.484,
342
+ "eval_steps_per_second": 3.754,
343
+ "eval_xlm-roberta-large-msmarco-eval_pearson_cosine": 0.9057294515964277,
344
+ "eval_xlm-roberta-large-msmarco-eval_spearman_cosine": 0.9369235606209017,
345
+ "step": 8400
346
+ },
347
+ {
348
+ "epoch": 0.8821740012167917,
349
+ "grad_norm": 14.758610725402832,
350
+ "learning_rate": 3.1051771731170076e-05,
351
+ "loss": 6.2403,
352
+ "step": 8700
353
+ },
354
+ {
355
+ "epoch": 0.9125937943621983,
356
+ "grad_norm": 10.743316650390625,
357
+ "learning_rate": 3.0206748915554055e-05,
358
+ "loss": 6.2264,
359
+ "step": 9000
360
+ },
361
+ {
362
+ "epoch": 0.9125937943621983,
363
+ "eval_loss": 6.1864914894104,
364
+ "eval_runtime": 137.8506,
365
+ "eval_samples_per_second": 722.971,
366
+ "eval_steps_per_second": 3.772,
367
+ "eval_xlm-roberta-large-msmarco-eval_pearson_cosine": 0.9103090365593022,
368
+ "eval_xlm-roberta-large-msmarco-eval_spearman_cosine": 0.9399249329773943,
369
+ "step": 9000
370
+ },
371
+ {
372
+ "epoch": 0.943013587507605,
373
+ "grad_norm": 10.912450790405273,
374
+ "learning_rate": 2.936172609993803e-05,
375
+ "loss": 6.228,
376
+ "step": 9300
377
+ },
378
+ {
379
+ "epoch": 0.9734333806530115,
380
+ "grad_norm": 10.816086769104004,
381
+ "learning_rate": 2.851670328432201e-05,
382
+ "loss": 6.2183,
383
+ "step": 9600
384
+ },
385
+ {
386
+ "epoch": 0.9734333806530115,
387
+ "eval_loss": 6.168646812438965,
388
+ "eval_runtime": 138.3732,
389
+ "eval_samples_per_second": 720.241,
390
+ "eval_steps_per_second": 3.758,
391
+ "eval_xlm-roberta-large-msmarco-eval_pearson_cosine": 0.9120469924143964,
392
+ "eval_xlm-roberta-large-msmarco-eval_spearman_cosine": 0.9418643543713353,
393
+ "step": 9600
394
+ },
395
+ {
396
+ "epoch": 1.0038531737984182,
397
+ "grad_norm": 7.334249496459961,
398
+ "learning_rate": 2.767168046870599e-05,
399
+ "loss": 6.1942,
400
+ "step": 9900
401
+ },
402
+ {
403
+ "epoch": 1.0342729669438249,
404
+ "grad_norm": 11.221212387084961,
405
+ "learning_rate": 2.682665765308997e-05,
406
+ "loss": 6.1156,
407
+ "step": 10200
408
+ },
409
+ {
410
+ "epoch": 1.0342729669438249,
411
+ "eval_loss": 6.172195911407471,
412
+ "eval_runtime": 140.5272,
413
+ "eval_samples_per_second": 709.201,
414
+ "eval_steps_per_second": 3.7,
415
+ "eval_xlm-roberta-large-msmarco-eval_pearson_cosine": 0.9150733613138559,
416
+ "eval_xlm-roberta-large-msmarco-eval_spearman_cosine": 0.944458159317982,
417
+ "step": 10200
418
+ },
419
+ {
420
+ "epoch": 1.0646927600892313,
421
+ "grad_norm": 10.197308540344238,
422
+ "learning_rate": 2.5981634837473945e-05,
423
+ "loss": 6.1092,
424
+ "step": 10500
425
+ },
426
+ {
427
+ "epoch": 1.095112553234638,
428
+ "grad_norm": 10.895206451416016,
429
+ "learning_rate": 2.5136612021857924e-05,
430
+ "loss": 6.115,
431
+ "step": 10800
432
+ },
433
+ {
434
+ "epoch": 1.095112553234638,
435
+ "eval_loss": 6.155474662780762,
436
+ "eval_runtime": 138.8551,
437
+ "eval_samples_per_second": 717.741,
438
+ "eval_steps_per_second": 3.745,
439
+ "eval_xlm-roberta-large-msmarco-eval_pearson_cosine": 0.9189361970149964,
440
+ "eval_xlm-roberta-large-msmarco-eval_spearman_cosine": 0.9454253136066088,
441
+ "step": 10800
442
+ },
443
+ {
444
+ "epoch": 1.1255323463800446,
445
+ "grad_norm": 7.99360466003418,
446
+ "learning_rate": 2.42915892062419e-05,
447
+ "loss": 6.1073,
448
+ "step": 11100
449
+ },
450
+ {
451
+ "epoch": 1.1559521395254513,
452
+ "grad_norm": 8.214856147766113,
453
+ "learning_rate": 2.3446566390625883e-05,
454
+ "loss": 6.0948,
455
+ "step": 11400
456
+ },
457
+ {
458
+ "epoch": 1.1559521395254513,
459
+ "eval_loss": 6.126833915710449,
460
+ "eval_runtime": 138.2296,
461
+ "eval_samples_per_second": 720.989,
462
+ "eval_steps_per_second": 3.762,
463
+ "eval_xlm-roberta-large-msmarco-eval_pearson_cosine": 0.920760214361107,
464
+ "eval_xlm-roberta-large-msmarco-eval_spearman_cosine": 0.9479362714648124,
465
+ "step": 11400
466
+ },
467
+ {
468
+ "epoch": 1.1863719326708577,
469
+ "grad_norm": 8.69244384765625,
470
+ "learning_rate": 2.260154357500986e-05,
471
+ "loss": 6.0847,
472
+ "step": 11700
473
+ },
474
+ {
475
+ "epoch": 1.2167917258162644,
476
+ "grad_norm": 8.387072563171387,
477
+ "learning_rate": 2.175652075939384e-05,
478
+ "loss": 6.0847,
479
+ "step": 12000
480
+ },
481
+ {
482
+ "epoch": 1.2167917258162644,
483
+ "eval_loss": 6.112495422363281,
484
+ "eval_runtime": 138.8912,
485
+ "eval_samples_per_second": 717.554,
486
+ "eval_steps_per_second": 3.744,
487
+ "eval_xlm-roberta-large-msmarco-eval_pearson_cosine": 0.9210961226847519,
488
+ "eval_xlm-roberta-large-msmarco-eval_spearman_cosine": 0.9499128638631187,
489
+ "step": 12000
490
+ },
491
+ {
492
+ "epoch": 1.247211518961671,
493
+ "grad_norm": 10.032007217407227,
494
+ "learning_rate": 2.0911497943777818e-05,
495
+ "loss": 6.0651,
496
+ "step": 12300
497
+ },
498
+ {
499
+ "epoch": 1.2776313121070777,
500
+ "grad_norm": 8.070867538452148,
501
+ "learning_rate": 2.0066475128161794e-05,
502
+ "loss": 6.0627,
503
+ "step": 12600
504
+ },
505
+ {
506
+ "epoch": 1.2776313121070777,
507
+ "eval_loss": 6.100788116455078,
508
+ "eval_runtime": 138.237,
509
+ "eval_samples_per_second": 720.95,
510
+ "eval_steps_per_second": 3.762,
511
+ "eval_xlm-roberta-large-msmarco-eval_pearson_cosine": 0.9255270604770037,
512
+ "eval_xlm-roberta-large-msmarco-eval_spearman_cosine": 0.9513841835406814,
513
+ "step": 12600
514
+ },
515
+ {
516
+ "epoch": 1.3080511052524844,
517
+ "grad_norm": 7.968441486358643,
518
+ "learning_rate": 1.9221452312545773e-05,
519
+ "loss": 6.0581,
520
+ "step": 12900
521
+ },
522
+ {
523
+ "epoch": 1.3384708983978908,
524
+ "grad_norm": 9.402104377746582,
525
+ "learning_rate": 1.837642949692975e-05,
526
+ "loss": 6.0487,
527
+ "step": 13200
528
+ },
529
+ {
530
+ "epoch": 1.3384708983978908,
531
+ "eval_loss": 6.084331512451172,
532
+ "eval_runtime": 138.9347,
533
+ "eval_samples_per_second": 717.33,
534
+ "eval_steps_per_second": 3.743,
535
+ "eval_xlm-roberta-large-msmarco-eval_pearson_cosine": 0.9269253835940942,
536
+ "eval_xlm-roberta-large-msmarco-eval_spearman_cosine": 0.9532928235476991,
537
+ "step": 13200
538
+ },
539
+ {
540
+ "epoch": 1.3688906915432975,
541
+ "grad_norm": 12.24899959564209,
542
+ "learning_rate": 1.7531406681313732e-05,
543
+ "loss": 6.0484,
544
+ "step": 13500
545
+ },
546
+ {
547
+ "epoch": 1.3993104846887041,
548
+ "grad_norm": 8.195536613464355,
549
+ "learning_rate": 1.6686383865697708e-05,
550
+ "loss": 6.0383,
551
+ "step": 13800
552
+ },
553
+ {
554
+ "epoch": 1.3993104846887041,
555
+ "eval_loss": 6.066676616668701,
556
+ "eval_runtime": 138.9021,
557
+ "eval_samples_per_second": 717.498,
558
+ "eval_steps_per_second": 3.744,
559
+ "eval_xlm-roberta-large-msmarco-eval_pearson_cosine": 0.9270862709285299,
560
+ "eval_xlm-roberta-large-msmarco-eval_spearman_cosine": 0.9549725220671897,
561
+ "step": 13800
562
+ },
563
+ {
564
+ "epoch": 1.4297302778341108,
565
+ "grad_norm": 8.008993148803711,
566
+ "learning_rate": 1.5841361050081684e-05,
567
+ "loss": 6.0207,
568
+ "step": 14100
569
+ },
570
+ {
571
+ "epoch": 1.4601500709795174,
572
+ "grad_norm": 8.773452758789062,
573
+ "learning_rate": 1.4996338234465665e-05,
574
+ "loss": 6.0266,
575
+ "step": 14400
576
+ },
577
+ {
578
+ "epoch": 1.4601500709795174,
579
+ "eval_loss": 6.052004337310791,
580
+ "eval_runtime": 137.7528,
581
+ "eval_samples_per_second": 723.484,
582
+ "eval_steps_per_second": 3.775,
583
+ "eval_xlm-roberta-large-msmarco-eval_pearson_cosine": 0.9299243076337231,
584
+ "eval_xlm-roberta-large-msmarco-eval_spearman_cosine": 0.956161573137467,
585
+ "step": 14400
586
+ },
587
+ {
588
+ "epoch": 1.4905698641249239,
589
+ "grad_norm": 10.285622596740723,
590
+ "learning_rate": 1.4151315418849643e-05,
591
+ "loss": 6.0048,
592
+ "step": 14700
593
+ },
594
+ {
595
+ "epoch": 1.5209896572703305,
596
+ "grad_norm": 9.126446723937988,
597
+ "learning_rate": 1.3306292603233622e-05,
598
+ "loss": 6.0105,
599
+ "step": 15000
600
+ },
601
+ {
602
+ "epoch": 1.5209896572703305,
603
+ "eval_loss": 6.039182662963867,
604
+ "eval_runtime": 139.7353,
605
+ "eval_samples_per_second": 713.22,
606
+ "eval_steps_per_second": 3.721,
607
+ "eval_xlm-roberta-large-msmarco-eval_pearson_cosine": 0.9320725619273144,
608
+ "eval_xlm-roberta-large-msmarco-eval_spearman_cosine": 0.9572253199644789,
609
+ "step": 15000
610
+ },
611
+ {
612
+ "epoch": 1.5514094504157372,
613
+ "grad_norm": 17.142406463623047,
614
+ "learning_rate": 1.24612697876176e-05,
615
+ "loss": 6.0073,
616
+ "step": 15300
617
+ },
618
+ {
619
+ "epoch": 1.5818292435611436,
620
+ "grad_norm": 11.44053840637207,
621
+ "learning_rate": 1.1616246972001578e-05,
622
+ "loss": 5.9958,
623
+ "step": 15600
624
+ },
625
+ {
626
+ "epoch": 1.5818292435611436,
627
+ "eval_loss": 6.031214237213135,
628
+ "eval_runtime": 137.8568,
629
+ "eval_samples_per_second": 722.939,
630
+ "eval_steps_per_second": 3.772,
631
+ "eval_xlm-roberta-large-msmarco-eval_pearson_cosine": 0.9330012427722795,
632
+ "eval_xlm-roberta-large-msmarco-eval_spearman_cosine": 0.9585095917055348,
633
+ "step": 15600
634
+ },
635
+ {
636
+ "epoch": 1.6122490367065505,
637
+ "grad_norm": 12.4039945602417,
638
+ "learning_rate": 1.0771224156385557e-05,
639
+ "loss": 5.9887,
640
+ "step": 15900
641
+ },
642
+ {
643
+ "epoch": 1.642668829851957,
644
+ "grad_norm": 12.85715103149414,
645
+ "learning_rate": 9.926201340769535e-06,
646
+ "loss": 5.9864,
647
+ "step": 16200
648
+ },
649
+ {
650
+ "epoch": 1.642668829851957,
651
+ "eval_loss": 6.028384685516357,
652
+ "eval_runtime": 138.0256,
653
+ "eval_samples_per_second": 722.054,
654
+ "eval_steps_per_second": 3.767,
655
+ "eval_xlm-roberta-large-msmarco-eval_pearson_cosine": 0.933956647989009,
656
+ "eval_xlm-roberta-large-msmarco-eval_spearman_cosine": 0.9594050626313877,
657
+ "step": 16200
658
+ },
659
+ {
660
+ "epoch": 1.6730886229973636,
661
+ "grad_norm": 15.63290023803711,
662
+ "learning_rate": 9.081178525153512e-06,
663
+ "loss": 5.9755,
664
+ "step": 16500
665
+ },
666
+ {
667
+ "epoch": 1.7035084161427703,
668
+ "grad_norm": 10.594440460205078,
669
+ "learning_rate": 8.23615570953749e-06,
670
+ "loss": 5.9698,
671
+ "step": 16800
672
+ },
673
+ {
674
+ "epoch": 1.7035084161427703,
675
+ "eval_loss": 6.011093616485596,
676
+ "eval_runtime": 137.6709,
677
+ "eval_samples_per_second": 723.915,
678
+ "eval_steps_per_second": 3.777,
679
+ "eval_xlm-roberta-large-msmarco-eval_pearson_cosine": 0.9355989385835076,
680
+ "eval_xlm-roberta-large-msmarco-eval_spearman_cosine": 0.960712740224626,
681
+ "step": 16800
682
+ },
683
+ {
684
+ "epoch": 1.7339282092881767,
685
+ "grad_norm": 11.96275806427002,
686
+ "learning_rate": 7.391132893921469e-06,
687
+ "loss": 5.9602,
688
+ "step": 17100
689
+ },
690
+ {
691
+ "epoch": 1.7643480024335836,
692
+ "grad_norm": 12.022747993469238,
693
+ "learning_rate": 6.546110078305448e-06,
694
+ "loss": 5.9629,
695
+ "step": 17400
696
+ },
697
+ {
698
+ "epoch": 1.7643480024335836,
699
+ "eval_loss": 5.9960408210754395,
700
+ "eval_runtime": 136.7186,
701
+ "eval_samples_per_second": 728.957,
702
+ "eval_steps_per_second": 3.803,
703
+ "eval_xlm-roberta-large-msmarco-eval_pearson_cosine": 0.9376654138281357,
704
+ "eval_xlm-roberta-large-msmarco-eval_spearman_cosine": 0.9618521782819777,
705
+ "step": 17400
706
+ },
707
+ {
708
+ "epoch": 1.79476779557899,
709
+ "grad_norm": 10.690680503845215,
710
+ "learning_rate": 5.7010872626894265e-06,
711
+ "loss": 5.9538,
712
+ "step": 17700
713
+ },
714
+ {
715
+ "epoch": 1.8251875887243967,
716
+ "grad_norm": 10.452280044555664,
717
+ "learning_rate": 4.856064447073404e-06,
718
+ "loss": 5.9488,
719
+ "step": 18000
720
+ },
721
+ {
722
+ "epoch": 1.8251875887243967,
723
+ "eval_loss": 5.988218307495117,
724
+ "eval_runtime": 137.709,
725
+ "eval_samples_per_second": 723.715,
726
+ "eval_steps_per_second": 3.776,
727
+ "eval_xlm-roberta-large-msmarco-eval_pearson_cosine": 0.9396191760822983,
728
+ "eval_xlm-roberta-large-msmarco-eval_spearman_cosine": 0.9621528925529551,
729
+ "step": 18000
730
+ },
731
+ {
732
+ "epoch": 1.8556073818698033,
733
+ "grad_norm": 9.798408508300781,
734
+ "learning_rate": 4.011041631457383e-06,
735
+ "loss": 5.9453,
736
+ "step": 18300
737
+ },
738
+ {
739
+ "epoch": 1.8860271750152098,
740
+ "grad_norm": 14.103642463684082,
741
+ "learning_rate": 3.166018815841361e-06,
742
+ "loss": 5.9436,
743
+ "step": 18600
744
+ },
745
+ {
746
+ "epoch": 1.8860271750152098,
747
+ "eval_loss": 5.973294258117676,
748
+ "eval_runtime": 138.3639,
749
+ "eval_samples_per_second": 720.289,
750
+ "eval_steps_per_second": 3.758,
751
+ "eval_xlm-roberta-large-msmarco-eval_pearson_cosine": 0.9397452670967106,
752
+ "eval_xlm-roberta-large-msmarco-eval_spearman_cosine": 0.9633766022074037,
753
+ "step": 18600
754
+ },
755
+ {
756
+ "epoch": 1.9164469681606167,
757
+ "grad_norm": 13.882421493530273,
758
+ "learning_rate": 2.3209960002253393e-06,
759
+ "loss": 5.9432,
760
+ "step": 18900
761
+ },
762
+ {
763
+ "epoch": 1.946866761306023,
764
+ "grad_norm": 9.432927131652832,
765
+ "learning_rate": 1.4759731846093179e-06,
766
+ "loss": 5.9362,
767
+ "step": 19200
768
+ },
769
+ {
770
+ "epoch": 1.946866761306023,
771
+ "eval_loss": 5.966466426849365,
772
+ "eval_runtime": 137.9485,
773
+ "eval_samples_per_second": 722.458,
774
+ "eval_steps_per_second": 3.77,
775
+ "eval_xlm-roberta-large-msmarco-eval_pearson_cosine": 0.9399553483988181,
776
+ "eval_xlm-roberta-large-msmarco-eval_spearman_cosine": 0.9639025708580746,
777
+ "step": 19200
778
+ },
779
+ {
780
+ "epoch": 1.9772865544514298,
781
+ "grad_norm": 9.866859436035156,
782
+ "learning_rate": 6.309503689932962e-07,
783
+ "loss": 5.9272,
784
+ "step": 19500
785
+ }
786
+ ],
787
+ "logging_steps": 300,
788
+ "max_steps": 19724,
789
+ "num_input_tokens_seen": 0,
790
+ "num_train_epochs": 2,
791
+ "save_steps": 600,
792
+ "stateful_callbacks": {
793
+ "TrainerControl": {
794
+ "args": {
795
+ "should_epoch_stop": false,
796
+ "should_evaluate": false,
797
+ "should_log": false,
798
+ "should_save": true,
799
+ "should_training_stop": true
800
+ },
801
+ "attributes": {}
802
+ }
803
+ },
804
+ "total_flos": 0.0,
805
+ "train_batch_size": 48,
806
+ "trial_name": null,
807
+ "trial_params": null
808
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b665f0c03c0d25c36fb0929e203d6dfbabb0607973cc5bc3bec385d5c009438e
3
+ size 5624