Fedoration commited on
Commit
4e13d83
·
1 Parent(s): d6ff5b6

added checkpoints

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-14964/config.json +40 -0
  2. rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-14964/optimizer.pt +3 -0
  3. rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-14964/pytorch_model.bin +3 -0
  4. rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-14964/rng_state.pth +3 -0
  5. rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-14964/scheduler.pt +3 -0
  6. rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-14964/trainer_state.json +217 -0
  7. rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-14964/training_args.bin +3 -0
  8. rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-22446/config.json +40 -0
  9. rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-22446/optimizer.pt +3 -0
  10. rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-22446/pytorch_model.bin +3 -0
  11. rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-22446/rng_state.pth +3 -0
  12. rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-22446/scheduler.pt +3 -0
  13. rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-22446/trainer_state.json +319 -0
  14. rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-22446/training_args.bin +3 -0
  15. rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-29928/config.json +40 -0
  16. rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-29928/optimizer.pt +3 -0
  17. rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-29928/pytorch_model.bin +3 -0
  18. rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-29928/rng_state.pth +3 -0
  19. rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-29928/scheduler.pt +3 -0
  20. rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-29928/trainer_state.json +421 -0
  21. rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-29928/training_args.bin +3 -0
  22. rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-37410/config.json +40 -0
  23. rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-37410/optimizer.pt +3 -0
  24. rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-37410/pytorch_model.bin +3 -0
  25. rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-37410/rng_state.pth +3 -0
  26. rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-37410/scheduler.pt +3 -0
  27. rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-37410/trainer_state.json +523 -0
  28. rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-37410/training_args.bin +3 -0
  29. rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-44892/config.json +40 -0
  30. rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-44892/optimizer.pt +3 -0
  31. rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-44892/pytorch_model.bin +3 -0
  32. rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-44892/rng_state.pth +3 -0
  33. rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-44892/scheduler.pt +3 -0
  34. rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-44892/trainer_state.json +625 -0
  35. rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-44892/training_args.bin +3 -0
  36. rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-52374/config.json +40 -0
  37. rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-52374/optimizer.pt +3 -0
  38. rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-52374/pytorch_model.bin +3 -0
  39. rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-52374/rng_state.pth +3 -0
  40. rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-52374/scheduler.pt +3 -0
  41. rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-52374/trainer_state.json +727 -0
  42. rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-52374/training_args.bin +3 -0
  43. rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-59856/config.json +40 -0
  44. rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-59856/optimizer.pt +3 -0
  45. rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-59856/pytorch_model.bin +3 -0
  46. rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-59856/rng_state.pth +3 -0
  47. rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-59856/scheduler.pt +3 -0
  48. rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-59856/trainer_state.json +829 -0
  49. rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-59856/training_args.bin +3 -0
  50. rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-67338/config.json +40 -0
rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-14964/config.json ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "DeepPavlov/rubert-base-cased-conversational",
3
+ "architectures": [
4
+ "BertForTokenClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "classifier_dropout": null,
8
+ "directionality": "bidi",
9
+ "hidden_act": "gelu",
10
+ "hidden_dropout_prob": 0.1,
11
+ "hidden_size": 768,
12
+ "id2label": {
13
+ "0": "O",
14
+ "1": "U"
15
+ },
16
+ "initializer_range": 0.02,
17
+ "intermediate_size": 3072,
18
+ "label2id": {
19
+ "O": 0,
20
+ "U": 1
21
+ },
22
+ "layer_norm_eps": 1e-12,
23
+ "max_position_embeddings": 512,
24
+ "model_type": "bert",
25
+ "num_attention_heads": 12,
26
+ "num_hidden_layers": 12,
27
+ "output_past": true,
28
+ "pad_token_id": 0,
29
+ "pooler_fc_size": 768,
30
+ "pooler_num_attention_heads": 12,
31
+ "pooler_num_fc_layers": 3,
32
+ "pooler_size_per_head": 128,
33
+ "pooler_type": "first_token_transform",
34
+ "position_embedding_type": "absolute",
35
+ "torch_dtype": "float32",
36
+ "transformers_version": "4.32.1",
37
+ "type_vocab_size": 2,
38
+ "use_cache": true,
39
+ "vocab_size": 119547
40
+ }
rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-14964/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8ec7b17b24be98bb5061bee26e48fcb0342f48a66630e356ef3575d4d7cba6d3
3
+ size 1418281093
rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-14964/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:45fd50b290bd60448e72ce386cbd24817b01c712099ea20c4888d14be5d08d06
3
+ size 709125289
rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-14964/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0e6c4fae2dbc4e09dcb1916b3513181a62899bf27dcfddffe76449df6be937a2
3
+ size 14575
rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-14964/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:45c6f90078a2450ac8516906119b963c86e82b14a8e8f9482b75b9f2e26282b4
3
+ size 627
rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-14964/trainer_state.json ADDED
@@ -0,0 +1,217 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 2.0,
5
+ "eval_steps": 500,
6
+ "global_step": 14964,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.07,
13
+ "learning_rate": 2e-05,
14
+ "loss": 0.1264,
15
+ "step": 500
16
+ },
17
+ {
18
+ "epoch": 0.13,
19
+ "learning_rate": 1.9865446716899895e-05,
20
+ "loss": 0.0227,
21
+ "step": 1000
22
+ },
23
+ {
24
+ "epoch": 0.2,
25
+ "learning_rate": 1.9730893433799788e-05,
26
+ "loss": 0.0203,
27
+ "step": 1500
28
+ },
29
+ {
30
+ "epoch": 0.27,
31
+ "learning_rate": 1.959634015069968e-05,
32
+ "loss": 0.0197,
33
+ "step": 2000
34
+ },
35
+ {
36
+ "epoch": 0.33,
37
+ "learning_rate": 1.946178686759957e-05,
38
+ "loss": 0.0185,
39
+ "step": 2500
40
+ },
41
+ {
42
+ "epoch": 0.4,
43
+ "learning_rate": 1.9327233584499463e-05,
44
+ "loss": 0.0177,
45
+ "step": 3000
46
+ },
47
+ {
48
+ "epoch": 0.47,
49
+ "learning_rate": 1.9192680301399356e-05,
50
+ "loss": 0.0181,
51
+ "step": 3500
52
+ },
53
+ {
54
+ "epoch": 0.53,
55
+ "learning_rate": 1.905812701829925e-05,
56
+ "loss": 0.0171,
57
+ "step": 4000
58
+ },
59
+ {
60
+ "epoch": 0.6,
61
+ "learning_rate": 1.8923573735199142e-05,
62
+ "loss": 0.0174,
63
+ "step": 4500
64
+ },
65
+ {
66
+ "epoch": 0.67,
67
+ "learning_rate": 1.8789020452099035e-05,
68
+ "loss": 0.017,
69
+ "step": 5000
70
+ },
71
+ {
72
+ "epoch": 0.74,
73
+ "learning_rate": 1.8654467168998925e-05,
74
+ "loss": 0.0168,
75
+ "step": 5500
76
+ },
77
+ {
78
+ "epoch": 0.8,
79
+ "learning_rate": 1.8519913885898818e-05,
80
+ "loss": 0.0161,
81
+ "step": 6000
82
+ },
83
+ {
84
+ "epoch": 0.87,
85
+ "learning_rate": 1.838536060279871e-05,
86
+ "loss": 0.0156,
87
+ "step": 6500
88
+ },
89
+ {
90
+ "epoch": 0.94,
91
+ "learning_rate": 1.8250807319698604e-05,
92
+ "loss": 0.0167,
93
+ "step": 7000
94
+ },
95
+ {
96
+ "epoch": 1.0,
97
+ "eval_accuracy": 0.9943580499255906,
98
+ "eval_f1": 0.0,
99
+ "eval_loss": 0.015657523646950722,
100
+ "eval_precision": 0.0,
101
+ "eval_recall": 0.0,
102
+ "eval_runtime": 71.099,
103
+ "eval_samples_per_second": 708.913,
104
+ "eval_steps_per_second": 5.542,
105
+ "step": 7482
106
+ },
107
+ {
108
+ "epoch": 1.0,
109
+ "learning_rate": 1.8116254036598497e-05,
110
+ "loss": 0.0152,
111
+ "step": 7500
112
+ },
113
+ {
114
+ "epoch": 1.07,
115
+ "learning_rate": 1.7981700753498386e-05,
116
+ "loss": 0.0119,
117
+ "step": 8000
118
+ },
119
+ {
120
+ "epoch": 1.14,
121
+ "learning_rate": 1.784714747039828e-05,
122
+ "loss": 0.0116,
123
+ "step": 8500
124
+ },
125
+ {
126
+ "epoch": 1.2,
127
+ "learning_rate": 1.7712594187298172e-05,
128
+ "loss": 0.0116,
129
+ "step": 9000
130
+ },
131
+ {
132
+ "epoch": 1.27,
133
+ "learning_rate": 1.7578040904198065e-05,
134
+ "loss": 0.0114,
135
+ "step": 9500
136
+ },
137
+ {
138
+ "epoch": 1.34,
139
+ "learning_rate": 1.7443487621097955e-05,
140
+ "loss": 0.0114,
141
+ "step": 10000
142
+ },
143
+ {
144
+ "epoch": 1.4,
145
+ "learning_rate": 1.7308934337997848e-05,
146
+ "loss": 0.0114,
147
+ "step": 10500
148
+ },
149
+ {
150
+ "epoch": 1.47,
151
+ "learning_rate": 1.717438105489774e-05,
152
+ "loss": 0.0118,
153
+ "step": 11000
154
+ },
155
+ {
156
+ "epoch": 1.54,
157
+ "learning_rate": 1.7039827771797634e-05,
158
+ "loss": 0.0119,
159
+ "step": 11500
160
+ },
161
+ {
162
+ "epoch": 1.6,
163
+ "learning_rate": 1.6905274488697524e-05,
164
+ "loss": 0.0111,
165
+ "step": 12000
166
+ },
167
+ {
168
+ "epoch": 1.67,
169
+ "learning_rate": 1.6770721205597417e-05,
170
+ "loss": 0.0118,
171
+ "step": 12500
172
+ },
173
+ {
174
+ "epoch": 1.74,
175
+ "learning_rate": 1.663616792249731e-05,
176
+ "loss": 0.0116,
177
+ "step": 13000
178
+ },
179
+ {
180
+ "epoch": 1.8,
181
+ "learning_rate": 1.6501614639397203e-05,
182
+ "loss": 0.0112,
183
+ "step": 13500
184
+ },
185
+ {
186
+ "epoch": 1.87,
187
+ "learning_rate": 1.6367061356297096e-05,
188
+ "loss": 0.0114,
189
+ "step": 14000
190
+ },
191
+ {
192
+ "epoch": 1.94,
193
+ "learning_rate": 1.6232508073196985e-05,
194
+ "loss": 0.0115,
195
+ "step": 14500
196
+ },
197
+ {
198
+ "epoch": 2.0,
199
+ "eval_accuracy": 0.9940472490465726,
200
+ "eval_f1": 0.0,
201
+ "eval_loss": 0.016565397381782532,
202
+ "eval_precision": 0.0,
203
+ "eval_recall": 0.0,
204
+ "eval_runtime": 82.6741,
205
+ "eval_samples_per_second": 609.659,
206
+ "eval_steps_per_second": 4.766,
207
+ "step": 14964
208
+ }
209
+ ],
210
+ "logging_steps": 500,
211
+ "max_steps": 74820,
212
+ "num_train_epochs": 10,
213
+ "save_steps": 500,
214
+ "total_flos": 1.7008214784542912e+17,
215
+ "trial_name": null,
216
+ "trial_params": null
217
+ }
rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-14964/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f713488e20afd5141b39f9aaa67feba7da0a360f752dbaa308013d8d5212f449
3
+ size 4091
rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-22446/config.json ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "DeepPavlov/rubert-base-cased-conversational",
3
+ "architectures": [
4
+ "BertForTokenClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "classifier_dropout": null,
8
+ "directionality": "bidi",
9
+ "hidden_act": "gelu",
10
+ "hidden_dropout_prob": 0.1,
11
+ "hidden_size": 768,
12
+ "id2label": {
13
+ "0": "O",
14
+ "1": "U"
15
+ },
16
+ "initializer_range": 0.02,
17
+ "intermediate_size": 3072,
18
+ "label2id": {
19
+ "O": 0,
20
+ "U": 1
21
+ },
22
+ "layer_norm_eps": 1e-12,
23
+ "max_position_embeddings": 512,
24
+ "model_type": "bert",
25
+ "num_attention_heads": 12,
26
+ "num_hidden_layers": 12,
27
+ "output_past": true,
28
+ "pad_token_id": 0,
29
+ "pooler_fc_size": 768,
30
+ "pooler_num_attention_heads": 12,
31
+ "pooler_num_fc_layers": 3,
32
+ "pooler_size_per_head": 128,
33
+ "pooler_type": "first_token_transform",
34
+ "position_embedding_type": "absolute",
35
+ "torch_dtype": "float32",
36
+ "transformers_version": "4.32.1",
37
+ "type_vocab_size": 2,
38
+ "use_cache": true,
39
+ "vocab_size": 119547
40
+ }
rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-22446/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5fd2e501a8385c447d794fe6ee9b83ef4c2e239115db3ba191ebc165a10e527f
3
+ size 1418281093
rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-22446/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5c233ab490e4a16384978560e9ce243b94e4d0edde66e9c7736ac72bc328cbed
3
+ size 709125289
rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-22446/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:26c94452baca72f151e8fd71f227f974658fc15f84f8df12aa691d214f2d3bb1
3
+ size 14575
rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-22446/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:95aa11d132c66709e53ab1861363db19e03cdd43abe90abb88ecfb1d00f88ed7
3
+ size 627
rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-22446/trainer_state.json ADDED
@@ -0,0 +1,319 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 3.0,
5
+ "eval_steps": 500,
6
+ "global_step": 22446,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.07,
13
+ "learning_rate": 2e-05,
14
+ "loss": 0.1264,
15
+ "step": 500
16
+ },
17
+ {
18
+ "epoch": 0.13,
19
+ "learning_rate": 1.9865446716899895e-05,
20
+ "loss": 0.0227,
21
+ "step": 1000
22
+ },
23
+ {
24
+ "epoch": 0.2,
25
+ "learning_rate": 1.9730893433799788e-05,
26
+ "loss": 0.0203,
27
+ "step": 1500
28
+ },
29
+ {
30
+ "epoch": 0.27,
31
+ "learning_rate": 1.959634015069968e-05,
32
+ "loss": 0.0197,
33
+ "step": 2000
34
+ },
35
+ {
36
+ "epoch": 0.33,
37
+ "learning_rate": 1.946178686759957e-05,
38
+ "loss": 0.0185,
39
+ "step": 2500
40
+ },
41
+ {
42
+ "epoch": 0.4,
43
+ "learning_rate": 1.9327233584499463e-05,
44
+ "loss": 0.0177,
45
+ "step": 3000
46
+ },
47
+ {
48
+ "epoch": 0.47,
49
+ "learning_rate": 1.9192680301399356e-05,
50
+ "loss": 0.0181,
51
+ "step": 3500
52
+ },
53
+ {
54
+ "epoch": 0.53,
55
+ "learning_rate": 1.905812701829925e-05,
56
+ "loss": 0.0171,
57
+ "step": 4000
58
+ },
59
+ {
60
+ "epoch": 0.6,
61
+ "learning_rate": 1.8923573735199142e-05,
62
+ "loss": 0.0174,
63
+ "step": 4500
64
+ },
65
+ {
66
+ "epoch": 0.67,
67
+ "learning_rate": 1.8789020452099035e-05,
68
+ "loss": 0.017,
69
+ "step": 5000
70
+ },
71
+ {
72
+ "epoch": 0.74,
73
+ "learning_rate": 1.8654467168998925e-05,
74
+ "loss": 0.0168,
75
+ "step": 5500
76
+ },
77
+ {
78
+ "epoch": 0.8,
79
+ "learning_rate": 1.8519913885898818e-05,
80
+ "loss": 0.0161,
81
+ "step": 6000
82
+ },
83
+ {
84
+ "epoch": 0.87,
85
+ "learning_rate": 1.838536060279871e-05,
86
+ "loss": 0.0156,
87
+ "step": 6500
88
+ },
89
+ {
90
+ "epoch": 0.94,
91
+ "learning_rate": 1.8250807319698604e-05,
92
+ "loss": 0.0167,
93
+ "step": 7000
94
+ },
95
+ {
96
+ "epoch": 1.0,
97
+ "eval_accuracy": 0.9943580499255906,
98
+ "eval_f1": 0.0,
99
+ "eval_loss": 0.015657523646950722,
100
+ "eval_precision": 0.0,
101
+ "eval_recall": 0.0,
102
+ "eval_runtime": 71.099,
103
+ "eval_samples_per_second": 708.913,
104
+ "eval_steps_per_second": 5.542,
105
+ "step": 7482
106
+ },
107
+ {
108
+ "epoch": 1.0,
109
+ "learning_rate": 1.8116254036598497e-05,
110
+ "loss": 0.0152,
111
+ "step": 7500
112
+ },
113
+ {
114
+ "epoch": 1.07,
115
+ "learning_rate": 1.7981700753498386e-05,
116
+ "loss": 0.0119,
117
+ "step": 8000
118
+ },
119
+ {
120
+ "epoch": 1.14,
121
+ "learning_rate": 1.784714747039828e-05,
122
+ "loss": 0.0116,
123
+ "step": 8500
124
+ },
125
+ {
126
+ "epoch": 1.2,
127
+ "learning_rate": 1.7712594187298172e-05,
128
+ "loss": 0.0116,
129
+ "step": 9000
130
+ },
131
+ {
132
+ "epoch": 1.27,
133
+ "learning_rate": 1.7578040904198065e-05,
134
+ "loss": 0.0114,
135
+ "step": 9500
136
+ },
137
+ {
138
+ "epoch": 1.34,
139
+ "learning_rate": 1.7443487621097955e-05,
140
+ "loss": 0.0114,
141
+ "step": 10000
142
+ },
143
+ {
144
+ "epoch": 1.4,
145
+ "learning_rate": 1.7308934337997848e-05,
146
+ "loss": 0.0114,
147
+ "step": 10500
148
+ },
149
+ {
150
+ "epoch": 1.47,
151
+ "learning_rate": 1.717438105489774e-05,
152
+ "loss": 0.0118,
153
+ "step": 11000
154
+ },
155
+ {
156
+ "epoch": 1.54,
157
+ "learning_rate": 1.7039827771797634e-05,
158
+ "loss": 0.0119,
159
+ "step": 11500
160
+ },
161
+ {
162
+ "epoch": 1.6,
163
+ "learning_rate": 1.6905274488697524e-05,
164
+ "loss": 0.0111,
165
+ "step": 12000
166
+ },
167
+ {
168
+ "epoch": 1.67,
169
+ "learning_rate": 1.6770721205597417e-05,
170
+ "loss": 0.0118,
171
+ "step": 12500
172
+ },
173
+ {
174
+ "epoch": 1.74,
175
+ "learning_rate": 1.663616792249731e-05,
176
+ "loss": 0.0116,
177
+ "step": 13000
178
+ },
179
+ {
180
+ "epoch": 1.8,
181
+ "learning_rate": 1.6501614639397203e-05,
182
+ "loss": 0.0112,
183
+ "step": 13500
184
+ },
185
+ {
186
+ "epoch": 1.87,
187
+ "learning_rate": 1.6367061356297096e-05,
188
+ "loss": 0.0114,
189
+ "step": 14000
190
+ },
191
+ {
192
+ "epoch": 1.94,
193
+ "learning_rate": 1.6232508073196985e-05,
194
+ "loss": 0.0115,
195
+ "step": 14500
196
+ },
197
+ {
198
+ "epoch": 2.0,
199
+ "eval_accuracy": 0.9940472490465726,
200
+ "eval_f1": 0.0,
201
+ "eval_loss": 0.016565397381782532,
202
+ "eval_precision": 0.0,
203
+ "eval_recall": 0.0,
204
+ "eval_runtime": 82.6741,
205
+ "eval_samples_per_second": 609.659,
206
+ "eval_steps_per_second": 4.766,
207
+ "step": 14964
208
+ },
209
+ {
210
+ "epoch": 2.0,
211
+ "learning_rate": 1.6097954790096878e-05,
212
+ "loss": 0.0109,
213
+ "step": 15000
214
+ },
215
+ {
216
+ "epoch": 2.07,
217
+ "learning_rate": 1.596340150699677e-05,
218
+ "loss": 0.0077,
219
+ "step": 15500
220
+ },
221
+ {
222
+ "epoch": 2.14,
223
+ "learning_rate": 1.5828848223896664e-05,
224
+ "loss": 0.0075,
225
+ "step": 16000
226
+ },
227
+ {
228
+ "epoch": 2.21,
229
+ "learning_rate": 1.5694294940796557e-05,
230
+ "loss": 0.0076,
231
+ "step": 16500
232
+ },
233
+ {
234
+ "epoch": 2.27,
235
+ "learning_rate": 1.555974165769645e-05,
236
+ "loss": 0.0076,
237
+ "step": 17000
238
+ },
239
+ {
240
+ "epoch": 2.34,
241
+ "learning_rate": 1.542518837459634e-05,
242
+ "loss": 0.0077,
243
+ "step": 17500
244
+ },
245
+ {
246
+ "epoch": 2.41,
247
+ "learning_rate": 1.5290635091496233e-05,
248
+ "loss": 0.0077,
249
+ "step": 18000
250
+ },
251
+ {
252
+ "epoch": 2.47,
253
+ "learning_rate": 1.5156081808396126e-05,
254
+ "loss": 0.0076,
255
+ "step": 18500
256
+ },
257
+ {
258
+ "epoch": 2.54,
259
+ "learning_rate": 1.5021528525296019e-05,
260
+ "loss": 0.0078,
261
+ "step": 19000
262
+ },
263
+ {
264
+ "epoch": 2.61,
265
+ "learning_rate": 1.4886975242195912e-05,
266
+ "loss": 0.008,
267
+ "step": 19500
268
+ },
269
+ {
270
+ "epoch": 2.67,
271
+ "learning_rate": 1.4752421959095805e-05,
272
+ "loss": 0.0077,
273
+ "step": 20000
274
+ },
275
+ {
276
+ "epoch": 2.74,
277
+ "learning_rate": 1.4617868675995694e-05,
278
+ "loss": 0.0078,
279
+ "step": 20500
280
+ },
281
+ {
282
+ "epoch": 2.81,
283
+ "learning_rate": 1.4483315392895587e-05,
284
+ "loss": 0.0081,
285
+ "step": 21000
286
+ },
287
+ {
288
+ "epoch": 2.87,
289
+ "learning_rate": 1.434876210979548e-05,
290
+ "loss": 0.0081,
291
+ "step": 21500
292
+ },
293
+ {
294
+ "epoch": 2.94,
295
+ "learning_rate": 1.4214208826695373e-05,
296
+ "loss": 0.0078,
297
+ "step": 22000
298
+ },
299
+ {
300
+ "epoch": 3.0,
301
+ "eval_accuracy": 0.9940838138558689,
302
+ "eval_f1": 0.0,
303
+ "eval_loss": 0.020097261294722557,
304
+ "eval_precision": 0.0,
305
+ "eval_recall": 0.0,
306
+ "eval_runtime": 71.581,
307
+ "eval_samples_per_second": 704.139,
308
+ "eval_steps_per_second": 5.504,
309
+ "step": 22446
310
+ }
311
+ ],
312
+ "logging_steps": 500,
313
+ "max_steps": 74820,
314
+ "num_train_epochs": 10,
315
+ "save_steps": 500,
316
+ "total_flos": 2.5512322176814368e+17,
317
+ "trial_name": null,
318
+ "trial_params": null
319
+ }
rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-22446/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f713488e20afd5141b39f9aaa67feba7da0a360f752dbaa308013d8d5212f449
3
+ size 4091
rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-29928/config.json ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "DeepPavlov/rubert-base-cased-conversational",
3
+ "architectures": [
4
+ "BertForTokenClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "classifier_dropout": null,
8
+ "directionality": "bidi",
9
+ "hidden_act": "gelu",
10
+ "hidden_dropout_prob": 0.1,
11
+ "hidden_size": 768,
12
+ "id2label": {
13
+ "0": "O",
14
+ "1": "U"
15
+ },
16
+ "initializer_range": 0.02,
17
+ "intermediate_size": 3072,
18
+ "label2id": {
19
+ "O": 0,
20
+ "U": 1
21
+ },
22
+ "layer_norm_eps": 1e-12,
23
+ "max_position_embeddings": 512,
24
+ "model_type": "bert",
25
+ "num_attention_heads": 12,
26
+ "num_hidden_layers": 12,
27
+ "output_past": true,
28
+ "pad_token_id": 0,
29
+ "pooler_fc_size": 768,
30
+ "pooler_num_attention_heads": 12,
31
+ "pooler_num_fc_layers": 3,
32
+ "pooler_size_per_head": 128,
33
+ "pooler_type": "first_token_transform",
34
+ "position_embedding_type": "absolute",
35
+ "torch_dtype": "float32",
36
+ "transformers_version": "4.32.1",
37
+ "type_vocab_size": 2,
38
+ "use_cache": true,
39
+ "vocab_size": 119547
40
+ }
rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-29928/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d5a688150a58135e72dadfc5d4565bc907692c77456484bef730d739ff3a3399
3
+ size 1418281093
rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-29928/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ad549c3c9a145d0ba0d3601a1b8871ffef6ec50f3964391cf383c6aeb10a44da
3
+ size 709125289
rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-29928/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d9bec1569f2b2b19949906563cbd6e2d8a69b35607a8366b6695d838b750331b
3
+ size 14575
rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-29928/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a5c5e1d1870ca9edfabe43cd9960ac818f48f088d52b8f2b3a331c9e576d215a
3
+ size 627
rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-29928/trainer_state.json ADDED
@@ -0,0 +1,421 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 4.0,
5
+ "eval_steps": 500,
6
+ "global_step": 29928,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.07,
13
+ "learning_rate": 2e-05,
14
+ "loss": 0.1264,
15
+ "step": 500
16
+ },
17
+ {
18
+ "epoch": 0.13,
19
+ "learning_rate": 1.9865446716899895e-05,
20
+ "loss": 0.0227,
21
+ "step": 1000
22
+ },
23
+ {
24
+ "epoch": 0.2,
25
+ "learning_rate": 1.9730893433799788e-05,
26
+ "loss": 0.0203,
27
+ "step": 1500
28
+ },
29
+ {
30
+ "epoch": 0.27,
31
+ "learning_rate": 1.959634015069968e-05,
32
+ "loss": 0.0197,
33
+ "step": 2000
34
+ },
35
+ {
36
+ "epoch": 0.33,
37
+ "learning_rate": 1.946178686759957e-05,
38
+ "loss": 0.0185,
39
+ "step": 2500
40
+ },
41
+ {
42
+ "epoch": 0.4,
43
+ "learning_rate": 1.9327233584499463e-05,
44
+ "loss": 0.0177,
45
+ "step": 3000
46
+ },
47
+ {
48
+ "epoch": 0.47,
49
+ "learning_rate": 1.9192680301399356e-05,
50
+ "loss": 0.0181,
51
+ "step": 3500
52
+ },
53
+ {
54
+ "epoch": 0.53,
55
+ "learning_rate": 1.905812701829925e-05,
56
+ "loss": 0.0171,
57
+ "step": 4000
58
+ },
59
+ {
60
+ "epoch": 0.6,
61
+ "learning_rate": 1.8923573735199142e-05,
62
+ "loss": 0.0174,
63
+ "step": 4500
64
+ },
65
+ {
66
+ "epoch": 0.67,
67
+ "learning_rate": 1.8789020452099035e-05,
68
+ "loss": 0.017,
69
+ "step": 5000
70
+ },
71
+ {
72
+ "epoch": 0.74,
73
+ "learning_rate": 1.8654467168998925e-05,
74
+ "loss": 0.0168,
75
+ "step": 5500
76
+ },
77
+ {
78
+ "epoch": 0.8,
79
+ "learning_rate": 1.8519913885898818e-05,
80
+ "loss": 0.0161,
81
+ "step": 6000
82
+ },
83
+ {
84
+ "epoch": 0.87,
85
+ "learning_rate": 1.838536060279871e-05,
86
+ "loss": 0.0156,
87
+ "step": 6500
88
+ },
89
+ {
90
+ "epoch": 0.94,
91
+ "learning_rate": 1.8250807319698604e-05,
92
+ "loss": 0.0167,
93
+ "step": 7000
94
+ },
95
+ {
96
+ "epoch": 1.0,
97
+ "eval_accuracy": 0.9943580499255906,
98
+ "eval_f1": 0.0,
99
+ "eval_loss": 0.015657523646950722,
100
+ "eval_precision": 0.0,
101
+ "eval_recall": 0.0,
102
+ "eval_runtime": 71.099,
103
+ "eval_samples_per_second": 708.913,
104
+ "eval_steps_per_second": 5.542,
105
+ "step": 7482
106
+ },
107
+ {
108
+ "epoch": 1.0,
109
+ "learning_rate": 1.8116254036598497e-05,
110
+ "loss": 0.0152,
111
+ "step": 7500
112
+ },
113
+ {
114
+ "epoch": 1.07,
115
+ "learning_rate": 1.7981700753498386e-05,
116
+ "loss": 0.0119,
117
+ "step": 8000
118
+ },
119
+ {
120
+ "epoch": 1.14,
121
+ "learning_rate": 1.784714747039828e-05,
122
+ "loss": 0.0116,
123
+ "step": 8500
124
+ },
125
+ {
126
+ "epoch": 1.2,
127
+ "learning_rate": 1.7712594187298172e-05,
128
+ "loss": 0.0116,
129
+ "step": 9000
130
+ },
131
+ {
132
+ "epoch": 1.27,
133
+ "learning_rate": 1.7578040904198065e-05,
134
+ "loss": 0.0114,
135
+ "step": 9500
136
+ },
137
+ {
138
+ "epoch": 1.34,
139
+ "learning_rate": 1.7443487621097955e-05,
140
+ "loss": 0.0114,
141
+ "step": 10000
142
+ },
143
+ {
144
+ "epoch": 1.4,
145
+ "learning_rate": 1.7308934337997848e-05,
146
+ "loss": 0.0114,
147
+ "step": 10500
148
+ },
149
+ {
150
+ "epoch": 1.47,
151
+ "learning_rate": 1.717438105489774e-05,
152
+ "loss": 0.0118,
153
+ "step": 11000
154
+ },
155
+ {
156
+ "epoch": 1.54,
157
+ "learning_rate": 1.7039827771797634e-05,
158
+ "loss": 0.0119,
159
+ "step": 11500
160
+ },
161
+ {
162
+ "epoch": 1.6,
163
+ "learning_rate": 1.6905274488697524e-05,
164
+ "loss": 0.0111,
165
+ "step": 12000
166
+ },
167
+ {
168
+ "epoch": 1.67,
169
+ "learning_rate": 1.6770721205597417e-05,
170
+ "loss": 0.0118,
171
+ "step": 12500
172
+ },
173
+ {
174
+ "epoch": 1.74,
175
+ "learning_rate": 1.663616792249731e-05,
176
+ "loss": 0.0116,
177
+ "step": 13000
178
+ },
179
+ {
180
+ "epoch": 1.8,
181
+ "learning_rate": 1.6501614639397203e-05,
182
+ "loss": 0.0112,
183
+ "step": 13500
184
+ },
185
+ {
186
+ "epoch": 1.87,
187
+ "learning_rate": 1.6367061356297096e-05,
188
+ "loss": 0.0114,
189
+ "step": 14000
190
+ },
191
+ {
192
+ "epoch": 1.94,
193
+ "learning_rate": 1.6232508073196985e-05,
194
+ "loss": 0.0115,
195
+ "step": 14500
196
+ },
197
+ {
198
+ "epoch": 2.0,
199
+ "eval_accuracy": 0.9940472490465726,
200
+ "eval_f1": 0.0,
201
+ "eval_loss": 0.016565397381782532,
202
+ "eval_precision": 0.0,
203
+ "eval_recall": 0.0,
204
+ "eval_runtime": 82.6741,
205
+ "eval_samples_per_second": 609.659,
206
+ "eval_steps_per_second": 4.766,
207
+ "step": 14964
208
+ },
209
+ {
210
+ "epoch": 2.0,
211
+ "learning_rate": 1.6097954790096878e-05,
212
+ "loss": 0.0109,
213
+ "step": 15000
214
+ },
215
+ {
216
+ "epoch": 2.07,
217
+ "learning_rate": 1.596340150699677e-05,
218
+ "loss": 0.0077,
219
+ "step": 15500
220
+ },
221
+ {
222
+ "epoch": 2.14,
223
+ "learning_rate": 1.5828848223896664e-05,
224
+ "loss": 0.0075,
225
+ "step": 16000
226
+ },
227
+ {
228
+ "epoch": 2.21,
229
+ "learning_rate": 1.5694294940796557e-05,
230
+ "loss": 0.0076,
231
+ "step": 16500
232
+ },
233
+ {
234
+ "epoch": 2.27,
235
+ "learning_rate": 1.555974165769645e-05,
236
+ "loss": 0.0076,
237
+ "step": 17000
238
+ },
239
+ {
240
+ "epoch": 2.34,
241
+ "learning_rate": 1.542518837459634e-05,
242
+ "loss": 0.0077,
243
+ "step": 17500
244
+ },
245
+ {
246
+ "epoch": 2.41,
247
+ "learning_rate": 1.5290635091496233e-05,
248
+ "loss": 0.0077,
249
+ "step": 18000
250
+ },
251
+ {
252
+ "epoch": 2.47,
253
+ "learning_rate": 1.5156081808396126e-05,
254
+ "loss": 0.0076,
255
+ "step": 18500
256
+ },
257
+ {
258
+ "epoch": 2.54,
259
+ "learning_rate": 1.5021528525296019e-05,
260
+ "loss": 0.0078,
261
+ "step": 19000
262
+ },
263
+ {
264
+ "epoch": 2.61,
265
+ "learning_rate": 1.4886975242195912e-05,
266
+ "loss": 0.008,
267
+ "step": 19500
268
+ },
269
+ {
270
+ "epoch": 2.67,
271
+ "learning_rate": 1.4752421959095805e-05,
272
+ "loss": 0.0077,
273
+ "step": 20000
274
+ },
275
+ {
276
+ "epoch": 2.74,
277
+ "learning_rate": 1.4617868675995694e-05,
278
+ "loss": 0.0078,
279
+ "step": 20500
280
+ },
281
+ {
282
+ "epoch": 2.81,
283
+ "learning_rate": 1.4483315392895587e-05,
284
+ "loss": 0.0081,
285
+ "step": 21000
286
+ },
287
+ {
288
+ "epoch": 2.87,
289
+ "learning_rate": 1.434876210979548e-05,
290
+ "loss": 0.0081,
291
+ "step": 21500
292
+ },
293
+ {
294
+ "epoch": 2.94,
295
+ "learning_rate": 1.4214208826695373e-05,
296
+ "loss": 0.0078,
297
+ "step": 22000
298
+ },
299
+ {
300
+ "epoch": 3.0,
301
+ "eval_accuracy": 0.9940838138558689,
302
+ "eval_f1": 0.0,
303
+ "eval_loss": 0.020097261294722557,
304
+ "eval_precision": 0.0,
305
+ "eval_recall": 0.0,
306
+ "eval_runtime": 71.581,
307
+ "eval_samples_per_second": 704.139,
308
+ "eval_steps_per_second": 5.504,
309
+ "step": 22446
310
+ },
311
+ {
312
+ "epoch": 3.01,
313
+ "learning_rate": 1.4079655543595265e-05,
314
+ "loss": 0.0074,
315
+ "step": 22500
316
+ },
317
+ {
318
+ "epoch": 3.07,
319
+ "learning_rate": 1.3945102260495158e-05,
320
+ "loss": 0.0051,
321
+ "step": 23000
322
+ },
323
+ {
324
+ "epoch": 3.14,
325
+ "learning_rate": 1.3810548977395049e-05,
326
+ "loss": 0.0052,
327
+ "step": 23500
328
+ },
329
+ {
330
+ "epoch": 3.21,
331
+ "learning_rate": 1.3675995694294942e-05,
332
+ "loss": 0.005,
333
+ "step": 24000
334
+ },
335
+ {
336
+ "epoch": 3.27,
337
+ "learning_rate": 1.3541442411194833e-05,
338
+ "loss": 0.0056,
339
+ "step": 24500
340
+ },
341
+ {
342
+ "epoch": 3.34,
343
+ "learning_rate": 1.3406889128094726e-05,
344
+ "loss": 0.0049,
345
+ "step": 25000
346
+ },
347
+ {
348
+ "epoch": 3.41,
349
+ "learning_rate": 1.3272335844994619e-05,
350
+ "loss": 0.0054,
351
+ "step": 25500
352
+ },
353
+ {
354
+ "epoch": 3.48,
355
+ "learning_rate": 1.3137782561894512e-05,
356
+ "loss": 0.0052,
357
+ "step": 26000
358
+ },
359
+ {
360
+ "epoch": 3.54,
361
+ "learning_rate": 1.3003229278794403e-05,
362
+ "loss": 0.0055,
363
+ "step": 26500
364
+ },
365
+ {
366
+ "epoch": 3.61,
367
+ "learning_rate": 1.2868675995694295e-05,
368
+ "loss": 0.0053,
369
+ "step": 27000
370
+ },
371
+ {
372
+ "epoch": 3.68,
373
+ "learning_rate": 1.2734122712594188e-05,
374
+ "loss": 0.0055,
375
+ "step": 27500
376
+ },
377
+ {
378
+ "epoch": 3.74,
379
+ "learning_rate": 1.259956942949408e-05,
380
+ "loss": 0.0056,
381
+ "step": 28000
382
+ },
383
+ {
384
+ "epoch": 3.81,
385
+ "learning_rate": 1.2465016146393974e-05,
386
+ "loss": 0.0056,
387
+ "step": 28500
388
+ },
389
+ {
390
+ "epoch": 3.88,
391
+ "learning_rate": 1.2330462863293867e-05,
392
+ "loss": 0.0056,
393
+ "step": 29000
394
+ },
395
+ {
396
+ "epoch": 3.94,
397
+ "learning_rate": 1.219590958019376e-05,
398
+ "loss": 0.0056,
399
+ "step": 29500
400
+ },
401
+ {
402
+ "epoch": 4.0,
403
+ "eval_accuracy": 0.9940216536800652,
404
+ "eval_f1": 0.0,
405
+ "eval_loss": 0.023068198934197426,
406
+ "eval_precision": 0.0,
407
+ "eval_recall": 0.0,
408
+ "eval_runtime": 81.8028,
409
+ "eval_samples_per_second": 616.153,
410
+ "eval_steps_per_second": 4.816,
411
+ "step": 29928
412
+ }
413
+ ],
414
+ "logging_steps": 500,
415
+ "max_steps": 74820,
416
+ "num_train_epochs": 10,
417
+ "save_steps": 500,
418
+ "total_flos": 3.4016429569085824e+17,
419
+ "trial_name": null,
420
+ "trial_params": null
421
+ }
rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-29928/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f713488e20afd5141b39f9aaa67feba7da0a360f752dbaa308013d8d5212f449
3
+ size 4091
rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-37410/config.json ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "DeepPavlov/rubert-base-cased-conversational",
3
+ "architectures": [
4
+ "BertForTokenClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "classifier_dropout": null,
8
+ "directionality": "bidi",
9
+ "hidden_act": "gelu",
10
+ "hidden_dropout_prob": 0.1,
11
+ "hidden_size": 768,
12
+ "id2label": {
13
+ "0": "O",
14
+ "1": "U"
15
+ },
16
+ "initializer_range": 0.02,
17
+ "intermediate_size": 3072,
18
+ "label2id": {
19
+ "O": 0,
20
+ "U": 1
21
+ },
22
+ "layer_norm_eps": 1e-12,
23
+ "max_position_embeddings": 512,
24
+ "model_type": "bert",
25
+ "num_attention_heads": 12,
26
+ "num_hidden_layers": 12,
27
+ "output_past": true,
28
+ "pad_token_id": 0,
29
+ "pooler_fc_size": 768,
30
+ "pooler_num_attention_heads": 12,
31
+ "pooler_num_fc_layers": 3,
32
+ "pooler_size_per_head": 128,
33
+ "pooler_type": "first_token_transform",
34
+ "position_embedding_type": "absolute",
35
+ "torch_dtype": "float32",
36
+ "transformers_version": "4.32.1",
37
+ "type_vocab_size": 2,
38
+ "use_cache": true,
39
+ "vocab_size": 119547
40
+ }
rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-37410/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5d46bdaf7365344d25ab56a273686b443074ac5df5b3da18af552adfb8bb0159
3
+ size 1418281093
rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-37410/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:420b30ad532637403259d1699e64db702f87677d6643105e5e5d9d86ebb4f92e
3
+ size 709125289
rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-37410/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:21ea456c1c79bc9fb04e070c6d6156fad4a9b07018a34e6d964fb53c806593e2
3
+ size 14575
rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-37410/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0dcee0eaa0686cccc9bf89fbdc18b90fe7a8add00f6fc71d0c5923474a451192
3
+ size 627
rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-37410/trainer_state.json ADDED
@@ -0,0 +1,523 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 5.0,
5
+ "eval_steps": 500,
6
+ "global_step": 37410,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.07,
13
+ "learning_rate": 2e-05,
14
+ "loss": 0.1264,
15
+ "step": 500
16
+ },
17
+ {
18
+ "epoch": 0.13,
19
+ "learning_rate": 1.9865446716899895e-05,
20
+ "loss": 0.0227,
21
+ "step": 1000
22
+ },
23
+ {
24
+ "epoch": 0.2,
25
+ "learning_rate": 1.9730893433799788e-05,
26
+ "loss": 0.0203,
27
+ "step": 1500
28
+ },
29
+ {
30
+ "epoch": 0.27,
31
+ "learning_rate": 1.959634015069968e-05,
32
+ "loss": 0.0197,
33
+ "step": 2000
34
+ },
35
+ {
36
+ "epoch": 0.33,
37
+ "learning_rate": 1.946178686759957e-05,
38
+ "loss": 0.0185,
39
+ "step": 2500
40
+ },
41
+ {
42
+ "epoch": 0.4,
43
+ "learning_rate": 1.9327233584499463e-05,
44
+ "loss": 0.0177,
45
+ "step": 3000
46
+ },
47
+ {
48
+ "epoch": 0.47,
49
+ "learning_rate": 1.9192680301399356e-05,
50
+ "loss": 0.0181,
51
+ "step": 3500
52
+ },
53
+ {
54
+ "epoch": 0.53,
55
+ "learning_rate": 1.905812701829925e-05,
56
+ "loss": 0.0171,
57
+ "step": 4000
58
+ },
59
+ {
60
+ "epoch": 0.6,
61
+ "learning_rate": 1.8923573735199142e-05,
62
+ "loss": 0.0174,
63
+ "step": 4500
64
+ },
65
+ {
66
+ "epoch": 0.67,
67
+ "learning_rate": 1.8789020452099035e-05,
68
+ "loss": 0.017,
69
+ "step": 5000
70
+ },
71
+ {
72
+ "epoch": 0.74,
73
+ "learning_rate": 1.8654467168998925e-05,
74
+ "loss": 0.0168,
75
+ "step": 5500
76
+ },
77
+ {
78
+ "epoch": 0.8,
79
+ "learning_rate": 1.8519913885898818e-05,
80
+ "loss": 0.0161,
81
+ "step": 6000
82
+ },
83
+ {
84
+ "epoch": 0.87,
85
+ "learning_rate": 1.838536060279871e-05,
86
+ "loss": 0.0156,
87
+ "step": 6500
88
+ },
89
+ {
90
+ "epoch": 0.94,
91
+ "learning_rate": 1.8250807319698604e-05,
92
+ "loss": 0.0167,
93
+ "step": 7000
94
+ },
95
+ {
96
+ "epoch": 1.0,
97
+ "eval_accuracy": 0.9943580499255906,
98
+ "eval_f1": 0.0,
99
+ "eval_loss": 0.015657523646950722,
100
+ "eval_precision": 0.0,
101
+ "eval_recall": 0.0,
102
+ "eval_runtime": 71.099,
103
+ "eval_samples_per_second": 708.913,
104
+ "eval_steps_per_second": 5.542,
105
+ "step": 7482
106
+ },
107
+ {
108
+ "epoch": 1.0,
109
+ "learning_rate": 1.8116254036598497e-05,
110
+ "loss": 0.0152,
111
+ "step": 7500
112
+ },
113
+ {
114
+ "epoch": 1.07,
115
+ "learning_rate": 1.7981700753498386e-05,
116
+ "loss": 0.0119,
117
+ "step": 8000
118
+ },
119
+ {
120
+ "epoch": 1.14,
121
+ "learning_rate": 1.784714747039828e-05,
122
+ "loss": 0.0116,
123
+ "step": 8500
124
+ },
125
+ {
126
+ "epoch": 1.2,
127
+ "learning_rate": 1.7712594187298172e-05,
128
+ "loss": 0.0116,
129
+ "step": 9000
130
+ },
131
+ {
132
+ "epoch": 1.27,
133
+ "learning_rate": 1.7578040904198065e-05,
134
+ "loss": 0.0114,
135
+ "step": 9500
136
+ },
137
+ {
138
+ "epoch": 1.34,
139
+ "learning_rate": 1.7443487621097955e-05,
140
+ "loss": 0.0114,
141
+ "step": 10000
142
+ },
143
+ {
144
+ "epoch": 1.4,
145
+ "learning_rate": 1.7308934337997848e-05,
146
+ "loss": 0.0114,
147
+ "step": 10500
148
+ },
149
+ {
150
+ "epoch": 1.47,
151
+ "learning_rate": 1.717438105489774e-05,
152
+ "loss": 0.0118,
153
+ "step": 11000
154
+ },
155
+ {
156
+ "epoch": 1.54,
157
+ "learning_rate": 1.7039827771797634e-05,
158
+ "loss": 0.0119,
159
+ "step": 11500
160
+ },
161
+ {
162
+ "epoch": 1.6,
163
+ "learning_rate": 1.6905274488697524e-05,
164
+ "loss": 0.0111,
165
+ "step": 12000
166
+ },
167
+ {
168
+ "epoch": 1.67,
169
+ "learning_rate": 1.6770721205597417e-05,
170
+ "loss": 0.0118,
171
+ "step": 12500
172
+ },
173
+ {
174
+ "epoch": 1.74,
175
+ "learning_rate": 1.663616792249731e-05,
176
+ "loss": 0.0116,
177
+ "step": 13000
178
+ },
179
+ {
180
+ "epoch": 1.8,
181
+ "learning_rate": 1.6501614639397203e-05,
182
+ "loss": 0.0112,
183
+ "step": 13500
184
+ },
185
+ {
186
+ "epoch": 1.87,
187
+ "learning_rate": 1.6367061356297096e-05,
188
+ "loss": 0.0114,
189
+ "step": 14000
190
+ },
191
+ {
192
+ "epoch": 1.94,
193
+ "learning_rate": 1.6232508073196985e-05,
194
+ "loss": 0.0115,
195
+ "step": 14500
196
+ },
197
+ {
198
+ "epoch": 2.0,
199
+ "eval_accuracy": 0.9940472490465726,
200
+ "eval_f1": 0.0,
201
+ "eval_loss": 0.016565397381782532,
202
+ "eval_precision": 0.0,
203
+ "eval_recall": 0.0,
204
+ "eval_runtime": 82.6741,
205
+ "eval_samples_per_second": 609.659,
206
+ "eval_steps_per_second": 4.766,
207
+ "step": 14964
208
+ },
209
+ {
210
+ "epoch": 2.0,
211
+ "learning_rate": 1.6097954790096878e-05,
212
+ "loss": 0.0109,
213
+ "step": 15000
214
+ },
215
+ {
216
+ "epoch": 2.07,
217
+ "learning_rate": 1.596340150699677e-05,
218
+ "loss": 0.0077,
219
+ "step": 15500
220
+ },
221
+ {
222
+ "epoch": 2.14,
223
+ "learning_rate": 1.5828848223896664e-05,
224
+ "loss": 0.0075,
225
+ "step": 16000
226
+ },
227
+ {
228
+ "epoch": 2.21,
229
+ "learning_rate": 1.5694294940796557e-05,
230
+ "loss": 0.0076,
231
+ "step": 16500
232
+ },
233
+ {
234
+ "epoch": 2.27,
235
+ "learning_rate": 1.555974165769645e-05,
236
+ "loss": 0.0076,
237
+ "step": 17000
238
+ },
239
+ {
240
+ "epoch": 2.34,
241
+ "learning_rate": 1.542518837459634e-05,
242
+ "loss": 0.0077,
243
+ "step": 17500
244
+ },
245
+ {
246
+ "epoch": 2.41,
247
+ "learning_rate": 1.5290635091496233e-05,
248
+ "loss": 0.0077,
249
+ "step": 18000
250
+ },
251
+ {
252
+ "epoch": 2.47,
253
+ "learning_rate": 1.5156081808396126e-05,
254
+ "loss": 0.0076,
255
+ "step": 18500
256
+ },
257
+ {
258
+ "epoch": 2.54,
259
+ "learning_rate": 1.5021528525296019e-05,
260
+ "loss": 0.0078,
261
+ "step": 19000
262
+ },
263
+ {
264
+ "epoch": 2.61,
265
+ "learning_rate": 1.4886975242195912e-05,
266
+ "loss": 0.008,
267
+ "step": 19500
268
+ },
269
+ {
270
+ "epoch": 2.67,
271
+ "learning_rate": 1.4752421959095805e-05,
272
+ "loss": 0.0077,
273
+ "step": 20000
274
+ },
275
+ {
276
+ "epoch": 2.74,
277
+ "learning_rate": 1.4617868675995694e-05,
278
+ "loss": 0.0078,
279
+ "step": 20500
280
+ },
281
+ {
282
+ "epoch": 2.81,
283
+ "learning_rate": 1.4483315392895587e-05,
284
+ "loss": 0.0081,
285
+ "step": 21000
286
+ },
287
+ {
288
+ "epoch": 2.87,
289
+ "learning_rate": 1.434876210979548e-05,
290
+ "loss": 0.0081,
291
+ "step": 21500
292
+ },
293
+ {
294
+ "epoch": 2.94,
295
+ "learning_rate": 1.4214208826695373e-05,
296
+ "loss": 0.0078,
297
+ "step": 22000
298
+ },
299
+ {
300
+ "epoch": 3.0,
301
+ "eval_accuracy": 0.9940838138558689,
302
+ "eval_f1": 0.0,
303
+ "eval_loss": 0.020097261294722557,
304
+ "eval_precision": 0.0,
305
+ "eval_recall": 0.0,
306
+ "eval_runtime": 71.581,
307
+ "eval_samples_per_second": 704.139,
308
+ "eval_steps_per_second": 5.504,
309
+ "step": 22446
310
+ },
311
+ {
312
+ "epoch": 3.01,
313
+ "learning_rate": 1.4079655543595265e-05,
314
+ "loss": 0.0074,
315
+ "step": 22500
316
+ },
317
+ {
318
+ "epoch": 3.07,
319
+ "learning_rate": 1.3945102260495158e-05,
320
+ "loss": 0.0051,
321
+ "step": 23000
322
+ },
323
+ {
324
+ "epoch": 3.14,
325
+ "learning_rate": 1.3810548977395049e-05,
326
+ "loss": 0.0052,
327
+ "step": 23500
328
+ },
329
+ {
330
+ "epoch": 3.21,
331
+ "learning_rate": 1.3675995694294942e-05,
332
+ "loss": 0.005,
333
+ "step": 24000
334
+ },
335
+ {
336
+ "epoch": 3.27,
337
+ "learning_rate": 1.3541442411194833e-05,
338
+ "loss": 0.0056,
339
+ "step": 24500
340
+ },
341
+ {
342
+ "epoch": 3.34,
343
+ "learning_rate": 1.3406889128094726e-05,
344
+ "loss": 0.0049,
345
+ "step": 25000
346
+ },
347
+ {
348
+ "epoch": 3.41,
349
+ "learning_rate": 1.3272335844994619e-05,
350
+ "loss": 0.0054,
351
+ "step": 25500
352
+ },
353
+ {
354
+ "epoch": 3.48,
355
+ "learning_rate": 1.3137782561894512e-05,
356
+ "loss": 0.0052,
357
+ "step": 26000
358
+ },
359
+ {
360
+ "epoch": 3.54,
361
+ "learning_rate": 1.3003229278794403e-05,
362
+ "loss": 0.0055,
363
+ "step": 26500
364
+ },
365
+ {
366
+ "epoch": 3.61,
367
+ "learning_rate": 1.2868675995694295e-05,
368
+ "loss": 0.0053,
369
+ "step": 27000
370
+ },
371
+ {
372
+ "epoch": 3.68,
373
+ "learning_rate": 1.2734122712594188e-05,
374
+ "loss": 0.0055,
375
+ "step": 27500
376
+ },
377
+ {
378
+ "epoch": 3.74,
379
+ "learning_rate": 1.259956942949408e-05,
380
+ "loss": 0.0056,
381
+ "step": 28000
382
+ },
383
+ {
384
+ "epoch": 3.81,
385
+ "learning_rate": 1.2465016146393974e-05,
386
+ "loss": 0.0056,
387
+ "step": 28500
388
+ },
389
+ {
390
+ "epoch": 3.88,
391
+ "learning_rate": 1.2330462863293867e-05,
392
+ "loss": 0.0056,
393
+ "step": 29000
394
+ },
395
+ {
396
+ "epoch": 3.94,
397
+ "learning_rate": 1.219590958019376e-05,
398
+ "loss": 0.0056,
399
+ "step": 29500
400
+ },
401
+ {
402
+ "epoch": 4.0,
403
+ "eval_accuracy": 0.9940216536800652,
404
+ "eval_f1": 0.0,
405
+ "eval_loss": 0.023068198934197426,
406
+ "eval_precision": 0.0,
407
+ "eval_recall": 0.0,
408
+ "eval_runtime": 81.8028,
409
+ "eval_samples_per_second": 616.153,
410
+ "eval_steps_per_second": 4.816,
411
+ "step": 29928
412
+ },
413
+ {
414
+ "epoch": 4.01,
415
+ "learning_rate": 1.206135629709365e-05,
416
+ "loss": 0.0053,
417
+ "step": 30000
418
+ },
419
+ {
420
+ "epoch": 4.08,
421
+ "learning_rate": 1.1926803013993542e-05,
422
+ "loss": 0.0035,
423
+ "step": 30500
424
+ },
425
+ {
426
+ "epoch": 4.14,
427
+ "learning_rate": 1.1792249730893435e-05,
428
+ "loss": 0.0037,
429
+ "step": 31000
430
+ },
431
+ {
432
+ "epoch": 4.21,
433
+ "learning_rate": 1.1657696447793328e-05,
434
+ "loss": 0.0039,
435
+ "step": 31500
436
+ },
437
+ {
438
+ "epoch": 4.28,
439
+ "learning_rate": 1.152314316469322e-05,
440
+ "loss": 0.0037,
441
+ "step": 32000
442
+ },
443
+ {
444
+ "epoch": 4.34,
445
+ "learning_rate": 1.1388589881593113e-05,
446
+ "loss": 0.0038,
447
+ "step": 32500
448
+ },
449
+ {
450
+ "epoch": 4.41,
451
+ "learning_rate": 1.1254036598493004e-05,
452
+ "loss": 0.004,
453
+ "step": 33000
454
+ },
455
+ {
456
+ "epoch": 4.48,
457
+ "learning_rate": 1.1119483315392897e-05,
458
+ "loss": 0.0039,
459
+ "step": 33500
460
+ },
461
+ {
462
+ "epoch": 4.54,
463
+ "learning_rate": 1.0984930032292788e-05,
464
+ "loss": 0.004,
465
+ "step": 34000
466
+ },
467
+ {
468
+ "epoch": 4.61,
469
+ "learning_rate": 1.0850376749192681e-05,
470
+ "loss": 0.0041,
471
+ "step": 34500
472
+ },
473
+ {
474
+ "epoch": 4.68,
475
+ "learning_rate": 1.0715823466092574e-05,
476
+ "loss": 0.0042,
477
+ "step": 35000
478
+ },
479
+ {
480
+ "epoch": 4.74,
481
+ "learning_rate": 1.0581270182992467e-05,
482
+ "loss": 0.004,
483
+ "step": 35500
484
+ },
485
+ {
486
+ "epoch": 4.81,
487
+ "learning_rate": 1.0446716899892357e-05,
488
+ "loss": 0.0042,
489
+ "step": 36000
490
+ },
491
+ {
492
+ "epoch": 4.88,
493
+ "learning_rate": 1.031216361679225e-05,
494
+ "loss": 0.0041,
495
+ "step": 36500
496
+ },
497
+ {
498
+ "epoch": 4.95,
499
+ "learning_rate": 1.0177610333692143e-05,
500
+ "loss": 0.0042,
501
+ "step": 37000
502
+ },
503
+ {
504
+ "epoch": 5.0,
505
+ "eval_accuracy": 0.9940509055275022,
506
+ "eval_f1": 0.0,
507
+ "eval_loss": 0.025542501360177994,
508
+ "eval_precision": 0.0,
509
+ "eval_recall": 0.0,
510
+ "eval_runtime": 71.0096,
511
+ "eval_samples_per_second": 709.806,
512
+ "eval_steps_per_second": 5.549,
513
+ "step": 37410
514
+ }
515
+ ],
516
+ "logging_steps": 500,
517
+ "max_steps": 74820,
518
+ "num_train_epochs": 10,
519
+ "save_steps": 500,
520
+ "total_flos": 4.252053696135728e+17,
521
+ "trial_name": null,
522
+ "trial_params": null
523
+ }
rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-37410/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f713488e20afd5141b39f9aaa67feba7da0a360f752dbaa308013d8d5212f449
3
+ size 4091
rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-44892/config.json ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "DeepPavlov/rubert-base-cased-conversational",
3
+ "architectures": [
4
+ "BertForTokenClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "classifier_dropout": null,
8
+ "directionality": "bidi",
9
+ "hidden_act": "gelu",
10
+ "hidden_dropout_prob": 0.1,
11
+ "hidden_size": 768,
12
+ "id2label": {
13
+ "0": "O",
14
+ "1": "U"
15
+ },
16
+ "initializer_range": 0.02,
17
+ "intermediate_size": 3072,
18
+ "label2id": {
19
+ "O": 0,
20
+ "U": 1
21
+ },
22
+ "layer_norm_eps": 1e-12,
23
+ "max_position_embeddings": 512,
24
+ "model_type": "bert",
25
+ "num_attention_heads": 12,
26
+ "num_hidden_layers": 12,
27
+ "output_past": true,
28
+ "pad_token_id": 0,
29
+ "pooler_fc_size": 768,
30
+ "pooler_num_attention_heads": 12,
31
+ "pooler_num_fc_layers": 3,
32
+ "pooler_size_per_head": 128,
33
+ "pooler_type": "first_token_transform",
34
+ "position_embedding_type": "absolute",
35
+ "torch_dtype": "float32",
36
+ "transformers_version": "4.32.1",
37
+ "type_vocab_size": 2,
38
+ "use_cache": true,
39
+ "vocab_size": 119547
40
+ }
rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-44892/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9de640f5776860cd313a47097c975e864bb42ca47fc51730f89698d9d2857cc4
3
+ size 1418281093
rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-44892/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bab6353e25df2b251f3b7462ad687d29b3d68929b15b559c28369de6db587918
3
+ size 709125289
rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-44892/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f2ee5b3486670d33998adce168a823ad85a897e28c706d6b3b4c7d317ac3008b
3
+ size 14575
rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-44892/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:04b8e9929c5c026cdb0866b798ad37fddb86cbbceb958e03b72203a2e6c7d4dd
3
+ size 627
rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-44892/trainer_state.json ADDED
@@ -0,0 +1,625 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 6.0,
5
+ "eval_steps": 500,
6
+ "global_step": 44892,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.07,
13
+ "learning_rate": 2e-05,
14
+ "loss": 0.1264,
15
+ "step": 500
16
+ },
17
+ {
18
+ "epoch": 0.13,
19
+ "learning_rate": 1.9865446716899895e-05,
20
+ "loss": 0.0227,
21
+ "step": 1000
22
+ },
23
+ {
24
+ "epoch": 0.2,
25
+ "learning_rate": 1.9730893433799788e-05,
26
+ "loss": 0.0203,
27
+ "step": 1500
28
+ },
29
+ {
30
+ "epoch": 0.27,
31
+ "learning_rate": 1.959634015069968e-05,
32
+ "loss": 0.0197,
33
+ "step": 2000
34
+ },
35
+ {
36
+ "epoch": 0.33,
37
+ "learning_rate": 1.946178686759957e-05,
38
+ "loss": 0.0185,
39
+ "step": 2500
40
+ },
41
+ {
42
+ "epoch": 0.4,
43
+ "learning_rate": 1.9327233584499463e-05,
44
+ "loss": 0.0177,
45
+ "step": 3000
46
+ },
47
+ {
48
+ "epoch": 0.47,
49
+ "learning_rate": 1.9192680301399356e-05,
50
+ "loss": 0.0181,
51
+ "step": 3500
52
+ },
53
+ {
54
+ "epoch": 0.53,
55
+ "learning_rate": 1.905812701829925e-05,
56
+ "loss": 0.0171,
57
+ "step": 4000
58
+ },
59
+ {
60
+ "epoch": 0.6,
61
+ "learning_rate": 1.8923573735199142e-05,
62
+ "loss": 0.0174,
63
+ "step": 4500
64
+ },
65
+ {
66
+ "epoch": 0.67,
67
+ "learning_rate": 1.8789020452099035e-05,
68
+ "loss": 0.017,
69
+ "step": 5000
70
+ },
71
+ {
72
+ "epoch": 0.74,
73
+ "learning_rate": 1.8654467168998925e-05,
74
+ "loss": 0.0168,
75
+ "step": 5500
76
+ },
77
+ {
78
+ "epoch": 0.8,
79
+ "learning_rate": 1.8519913885898818e-05,
80
+ "loss": 0.0161,
81
+ "step": 6000
82
+ },
83
+ {
84
+ "epoch": 0.87,
85
+ "learning_rate": 1.838536060279871e-05,
86
+ "loss": 0.0156,
87
+ "step": 6500
88
+ },
89
+ {
90
+ "epoch": 0.94,
91
+ "learning_rate": 1.8250807319698604e-05,
92
+ "loss": 0.0167,
93
+ "step": 7000
94
+ },
95
+ {
96
+ "epoch": 1.0,
97
+ "eval_accuracy": 0.9943580499255906,
98
+ "eval_f1": 0.0,
99
+ "eval_loss": 0.015657523646950722,
100
+ "eval_precision": 0.0,
101
+ "eval_recall": 0.0,
102
+ "eval_runtime": 71.099,
103
+ "eval_samples_per_second": 708.913,
104
+ "eval_steps_per_second": 5.542,
105
+ "step": 7482
106
+ },
107
+ {
108
+ "epoch": 1.0,
109
+ "learning_rate": 1.8116254036598497e-05,
110
+ "loss": 0.0152,
111
+ "step": 7500
112
+ },
113
+ {
114
+ "epoch": 1.07,
115
+ "learning_rate": 1.7981700753498386e-05,
116
+ "loss": 0.0119,
117
+ "step": 8000
118
+ },
119
+ {
120
+ "epoch": 1.14,
121
+ "learning_rate": 1.784714747039828e-05,
122
+ "loss": 0.0116,
123
+ "step": 8500
124
+ },
125
+ {
126
+ "epoch": 1.2,
127
+ "learning_rate": 1.7712594187298172e-05,
128
+ "loss": 0.0116,
129
+ "step": 9000
130
+ },
131
+ {
132
+ "epoch": 1.27,
133
+ "learning_rate": 1.7578040904198065e-05,
134
+ "loss": 0.0114,
135
+ "step": 9500
136
+ },
137
+ {
138
+ "epoch": 1.34,
139
+ "learning_rate": 1.7443487621097955e-05,
140
+ "loss": 0.0114,
141
+ "step": 10000
142
+ },
143
+ {
144
+ "epoch": 1.4,
145
+ "learning_rate": 1.7308934337997848e-05,
146
+ "loss": 0.0114,
147
+ "step": 10500
148
+ },
149
+ {
150
+ "epoch": 1.47,
151
+ "learning_rate": 1.717438105489774e-05,
152
+ "loss": 0.0118,
153
+ "step": 11000
154
+ },
155
+ {
156
+ "epoch": 1.54,
157
+ "learning_rate": 1.7039827771797634e-05,
158
+ "loss": 0.0119,
159
+ "step": 11500
160
+ },
161
+ {
162
+ "epoch": 1.6,
163
+ "learning_rate": 1.6905274488697524e-05,
164
+ "loss": 0.0111,
165
+ "step": 12000
166
+ },
167
+ {
168
+ "epoch": 1.67,
169
+ "learning_rate": 1.6770721205597417e-05,
170
+ "loss": 0.0118,
171
+ "step": 12500
172
+ },
173
+ {
174
+ "epoch": 1.74,
175
+ "learning_rate": 1.663616792249731e-05,
176
+ "loss": 0.0116,
177
+ "step": 13000
178
+ },
179
+ {
180
+ "epoch": 1.8,
181
+ "learning_rate": 1.6501614639397203e-05,
182
+ "loss": 0.0112,
183
+ "step": 13500
184
+ },
185
+ {
186
+ "epoch": 1.87,
187
+ "learning_rate": 1.6367061356297096e-05,
188
+ "loss": 0.0114,
189
+ "step": 14000
190
+ },
191
+ {
192
+ "epoch": 1.94,
193
+ "learning_rate": 1.6232508073196985e-05,
194
+ "loss": 0.0115,
195
+ "step": 14500
196
+ },
197
+ {
198
+ "epoch": 2.0,
199
+ "eval_accuracy": 0.9940472490465726,
200
+ "eval_f1": 0.0,
201
+ "eval_loss": 0.016565397381782532,
202
+ "eval_precision": 0.0,
203
+ "eval_recall": 0.0,
204
+ "eval_runtime": 82.6741,
205
+ "eval_samples_per_second": 609.659,
206
+ "eval_steps_per_second": 4.766,
207
+ "step": 14964
208
+ },
209
+ {
210
+ "epoch": 2.0,
211
+ "learning_rate": 1.6097954790096878e-05,
212
+ "loss": 0.0109,
213
+ "step": 15000
214
+ },
215
+ {
216
+ "epoch": 2.07,
217
+ "learning_rate": 1.596340150699677e-05,
218
+ "loss": 0.0077,
219
+ "step": 15500
220
+ },
221
+ {
222
+ "epoch": 2.14,
223
+ "learning_rate": 1.5828848223896664e-05,
224
+ "loss": 0.0075,
225
+ "step": 16000
226
+ },
227
+ {
228
+ "epoch": 2.21,
229
+ "learning_rate": 1.5694294940796557e-05,
230
+ "loss": 0.0076,
231
+ "step": 16500
232
+ },
233
+ {
234
+ "epoch": 2.27,
235
+ "learning_rate": 1.555974165769645e-05,
236
+ "loss": 0.0076,
237
+ "step": 17000
238
+ },
239
+ {
240
+ "epoch": 2.34,
241
+ "learning_rate": 1.542518837459634e-05,
242
+ "loss": 0.0077,
243
+ "step": 17500
244
+ },
245
+ {
246
+ "epoch": 2.41,
247
+ "learning_rate": 1.5290635091496233e-05,
248
+ "loss": 0.0077,
249
+ "step": 18000
250
+ },
251
+ {
252
+ "epoch": 2.47,
253
+ "learning_rate": 1.5156081808396126e-05,
254
+ "loss": 0.0076,
255
+ "step": 18500
256
+ },
257
+ {
258
+ "epoch": 2.54,
259
+ "learning_rate": 1.5021528525296019e-05,
260
+ "loss": 0.0078,
261
+ "step": 19000
262
+ },
263
+ {
264
+ "epoch": 2.61,
265
+ "learning_rate": 1.4886975242195912e-05,
266
+ "loss": 0.008,
267
+ "step": 19500
268
+ },
269
+ {
270
+ "epoch": 2.67,
271
+ "learning_rate": 1.4752421959095805e-05,
272
+ "loss": 0.0077,
273
+ "step": 20000
274
+ },
275
+ {
276
+ "epoch": 2.74,
277
+ "learning_rate": 1.4617868675995694e-05,
278
+ "loss": 0.0078,
279
+ "step": 20500
280
+ },
281
+ {
282
+ "epoch": 2.81,
283
+ "learning_rate": 1.4483315392895587e-05,
284
+ "loss": 0.0081,
285
+ "step": 21000
286
+ },
287
+ {
288
+ "epoch": 2.87,
289
+ "learning_rate": 1.434876210979548e-05,
290
+ "loss": 0.0081,
291
+ "step": 21500
292
+ },
293
+ {
294
+ "epoch": 2.94,
295
+ "learning_rate": 1.4214208826695373e-05,
296
+ "loss": 0.0078,
297
+ "step": 22000
298
+ },
299
+ {
300
+ "epoch": 3.0,
301
+ "eval_accuracy": 0.9940838138558689,
302
+ "eval_f1": 0.0,
303
+ "eval_loss": 0.020097261294722557,
304
+ "eval_precision": 0.0,
305
+ "eval_recall": 0.0,
306
+ "eval_runtime": 71.581,
307
+ "eval_samples_per_second": 704.139,
308
+ "eval_steps_per_second": 5.504,
309
+ "step": 22446
310
+ },
311
+ {
312
+ "epoch": 3.01,
313
+ "learning_rate": 1.4079655543595265e-05,
314
+ "loss": 0.0074,
315
+ "step": 22500
316
+ },
317
+ {
318
+ "epoch": 3.07,
319
+ "learning_rate": 1.3945102260495158e-05,
320
+ "loss": 0.0051,
321
+ "step": 23000
322
+ },
323
+ {
324
+ "epoch": 3.14,
325
+ "learning_rate": 1.3810548977395049e-05,
326
+ "loss": 0.0052,
327
+ "step": 23500
328
+ },
329
+ {
330
+ "epoch": 3.21,
331
+ "learning_rate": 1.3675995694294942e-05,
332
+ "loss": 0.005,
333
+ "step": 24000
334
+ },
335
+ {
336
+ "epoch": 3.27,
337
+ "learning_rate": 1.3541442411194833e-05,
338
+ "loss": 0.0056,
339
+ "step": 24500
340
+ },
341
+ {
342
+ "epoch": 3.34,
343
+ "learning_rate": 1.3406889128094726e-05,
344
+ "loss": 0.0049,
345
+ "step": 25000
346
+ },
347
+ {
348
+ "epoch": 3.41,
349
+ "learning_rate": 1.3272335844994619e-05,
350
+ "loss": 0.0054,
351
+ "step": 25500
352
+ },
353
+ {
354
+ "epoch": 3.48,
355
+ "learning_rate": 1.3137782561894512e-05,
356
+ "loss": 0.0052,
357
+ "step": 26000
358
+ },
359
+ {
360
+ "epoch": 3.54,
361
+ "learning_rate": 1.3003229278794403e-05,
362
+ "loss": 0.0055,
363
+ "step": 26500
364
+ },
365
+ {
366
+ "epoch": 3.61,
367
+ "learning_rate": 1.2868675995694295e-05,
368
+ "loss": 0.0053,
369
+ "step": 27000
370
+ },
371
+ {
372
+ "epoch": 3.68,
373
+ "learning_rate": 1.2734122712594188e-05,
374
+ "loss": 0.0055,
375
+ "step": 27500
376
+ },
377
+ {
378
+ "epoch": 3.74,
379
+ "learning_rate": 1.259956942949408e-05,
380
+ "loss": 0.0056,
381
+ "step": 28000
382
+ },
383
+ {
384
+ "epoch": 3.81,
385
+ "learning_rate": 1.2465016146393974e-05,
386
+ "loss": 0.0056,
387
+ "step": 28500
388
+ },
389
+ {
390
+ "epoch": 3.88,
391
+ "learning_rate": 1.2330462863293867e-05,
392
+ "loss": 0.0056,
393
+ "step": 29000
394
+ },
395
+ {
396
+ "epoch": 3.94,
397
+ "learning_rate": 1.219590958019376e-05,
398
+ "loss": 0.0056,
399
+ "step": 29500
400
+ },
401
+ {
402
+ "epoch": 4.0,
403
+ "eval_accuracy": 0.9940216536800652,
404
+ "eval_f1": 0.0,
405
+ "eval_loss": 0.023068198934197426,
406
+ "eval_precision": 0.0,
407
+ "eval_recall": 0.0,
408
+ "eval_runtime": 81.8028,
409
+ "eval_samples_per_second": 616.153,
410
+ "eval_steps_per_second": 4.816,
411
+ "step": 29928
412
+ },
413
+ {
414
+ "epoch": 4.01,
415
+ "learning_rate": 1.206135629709365e-05,
416
+ "loss": 0.0053,
417
+ "step": 30000
418
+ },
419
+ {
420
+ "epoch": 4.08,
421
+ "learning_rate": 1.1926803013993542e-05,
422
+ "loss": 0.0035,
423
+ "step": 30500
424
+ },
425
+ {
426
+ "epoch": 4.14,
427
+ "learning_rate": 1.1792249730893435e-05,
428
+ "loss": 0.0037,
429
+ "step": 31000
430
+ },
431
+ {
432
+ "epoch": 4.21,
433
+ "learning_rate": 1.1657696447793328e-05,
434
+ "loss": 0.0039,
435
+ "step": 31500
436
+ },
437
+ {
438
+ "epoch": 4.28,
439
+ "learning_rate": 1.152314316469322e-05,
440
+ "loss": 0.0037,
441
+ "step": 32000
442
+ },
443
+ {
444
+ "epoch": 4.34,
445
+ "learning_rate": 1.1388589881593113e-05,
446
+ "loss": 0.0038,
447
+ "step": 32500
448
+ },
449
+ {
450
+ "epoch": 4.41,
451
+ "learning_rate": 1.1254036598493004e-05,
452
+ "loss": 0.004,
453
+ "step": 33000
454
+ },
455
+ {
456
+ "epoch": 4.48,
457
+ "learning_rate": 1.1119483315392897e-05,
458
+ "loss": 0.0039,
459
+ "step": 33500
460
+ },
461
+ {
462
+ "epoch": 4.54,
463
+ "learning_rate": 1.0984930032292788e-05,
464
+ "loss": 0.004,
465
+ "step": 34000
466
+ },
467
+ {
468
+ "epoch": 4.61,
469
+ "learning_rate": 1.0850376749192681e-05,
470
+ "loss": 0.0041,
471
+ "step": 34500
472
+ },
473
+ {
474
+ "epoch": 4.68,
475
+ "learning_rate": 1.0715823466092574e-05,
476
+ "loss": 0.0042,
477
+ "step": 35000
478
+ },
479
+ {
480
+ "epoch": 4.74,
481
+ "learning_rate": 1.0581270182992467e-05,
482
+ "loss": 0.004,
483
+ "step": 35500
484
+ },
485
+ {
486
+ "epoch": 4.81,
487
+ "learning_rate": 1.0446716899892357e-05,
488
+ "loss": 0.0042,
489
+ "step": 36000
490
+ },
491
+ {
492
+ "epoch": 4.88,
493
+ "learning_rate": 1.031216361679225e-05,
494
+ "loss": 0.0041,
495
+ "step": 36500
496
+ },
497
+ {
498
+ "epoch": 4.95,
499
+ "learning_rate": 1.0177610333692143e-05,
500
+ "loss": 0.0042,
501
+ "step": 37000
502
+ },
503
+ {
504
+ "epoch": 5.0,
505
+ "eval_accuracy": 0.9940509055275022,
506
+ "eval_f1": 0.0,
507
+ "eval_loss": 0.025542501360177994,
508
+ "eval_precision": 0.0,
509
+ "eval_recall": 0.0,
510
+ "eval_runtime": 71.0096,
511
+ "eval_samples_per_second": 709.806,
512
+ "eval_steps_per_second": 5.549,
513
+ "step": 37410
514
+ },
515
+ {
516
+ "epoch": 5.01,
517
+ "learning_rate": 1.0043057050592036e-05,
518
+ "loss": 0.0038,
519
+ "step": 37500
520
+ },
521
+ {
522
+ "epoch": 5.08,
523
+ "learning_rate": 9.908503767491927e-06,
524
+ "loss": 0.0029,
525
+ "step": 38000
526
+ },
527
+ {
528
+ "epoch": 5.15,
529
+ "learning_rate": 9.77395048439182e-06,
530
+ "loss": 0.003,
531
+ "step": 38500
532
+ },
533
+ {
534
+ "epoch": 5.21,
535
+ "learning_rate": 9.639397201291713e-06,
536
+ "loss": 0.003,
537
+ "step": 39000
538
+ },
539
+ {
540
+ "epoch": 5.28,
541
+ "learning_rate": 9.504843918191604e-06,
542
+ "loss": 0.003,
543
+ "step": 39500
544
+ },
545
+ {
546
+ "epoch": 5.35,
547
+ "learning_rate": 9.370290635091497e-06,
548
+ "loss": 0.0029,
549
+ "step": 40000
550
+ },
551
+ {
552
+ "epoch": 5.41,
553
+ "learning_rate": 9.23573735199139e-06,
554
+ "loss": 0.0031,
555
+ "step": 40500
556
+ },
557
+ {
558
+ "epoch": 5.48,
559
+ "learning_rate": 9.101184068891282e-06,
560
+ "loss": 0.0031,
561
+ "step": 41000
562
+ },
563
+ {
564
+ "epoch": 5.55,
565
+ "learning_rate": 8.966630785791175e-06,
566
+ "loss": 0.0031,
567
+ "step": 41500
568
+ },
569
+ {
570
+ "epoch": 5.61,
571
+ "learning_rate": 8.832077502691066e-06,
572
+ "loss": 0.0032,
573
+ "step": 42000
574
+ },
575
+ {
576
+ "epoch": 5.68,
577
+ "learning_rate": 8.697524219590959e-06,
578
+ "loss": 0.003,
579
+ "step": 42500
580
+ },
581
+ {
582
+ "epoch": 5.75,
583
+ "learning_rate": 8.56297093649085e-06,
584
+ "loss": 0.0032,
585
+ "step": 43000
586
+ },
587
+ {
588
+ "epoch": 5.81,
589
+ "learning_rate": 8.428417653390743e-06,
590
+ "loss": 0.0033,
591
+ "step": 43500
592
+ },
593
+ {
594
+ "epoch": 5.88,
595
+ "learning_rate": 8.293864370290636e-06,
596
+ "loss": 0.0032,
597
+ "step": 44000
598
+ },
599
+ {
600
+ "epoch": 5.95,
601
+ "learning_rate": 8.159311087190527e-06,
602
+ "loss": 0.0032,
603
+ "step": 44500
604
+ },
605
+ {
606
+ "epoch": 6.0,
607
+ "eval_accuracy": 0.9940655314512207,
608
+ "eval_f1": 0.0,
609
+ "eval_loss": 0.031192485243082047,
610
+ "eval_precision": 0.0,
611
+ "eval_recall": 0.0,
612
+ "eval_runtime": 81.6971,
613
+ "eval_samples_per_second": 616.95,
614
+ "eval_steps_per_second": 4.823,
615
+ "step": 44892
616
+ }
617
+ ],
618
+ "logging_steps": 500,
619
+ "max_steps": 74820,
620
+ "num_train_epochs": 10,
621
+ "save_steps": 500,
622
+ "total_flos": 5.1024644353628736e+17,
623
+ "trial_name": null,
624
+ "trial_params": null
625
+ }
rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-44892/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f713488e20afd5141b39f9aaa67feba7da0a360f752dbaa308013d8d5212f449
3
+ size 4091
rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-52374/config.json ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "DeepPavlov/rubert-base-cased-conversational",
3
+ "architectures": [
4
+ "BertForTokenClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "classifier_dropout": null,
8
+ "directionality": "bidi",
9
+ "hidden_act": "gelu",
10
+ "hidden_dropout_prob": 0.1,
11
+ "hidden_size": 768,
12
+ "id2label": {
13
+ "0": "O",
14
+ "1": "U"
15
+ },
16
+ "initializer_range": 0.02,
17
+ "intermediate_size": 3072,
18
+ "label2id": {
19
+ "O": 0,
20
+ "U": 1
21
+ },
22
+ "layer_norm_eps": 1e-12,
23
+ "max_position_embeddings": 512,
24
+ "model_type": "bert",
25
+ "num_attention_heads": 12,
26
+ "num_hidden_layers": 12,
27
+ "output_past": true,
28
+ "pad_token_id": 0,
29
+ "pooler_fc_size": 768,
30
+ "pooler_num_attention_heads": 12,
31
+ "pooler_num_fc_layers": 3,
32
+ "pooler_size_per_head": 128,
33
+ "pooler_type": "first_token_transform",
34
+ "position_embedding_type": "absolute",
35
+ "torch_dtype": "float32",
36
+ "transformers_version": "4.32.1",
37
+ "type_vocab_size": 2,
38
+ "use_cache": true,
39
+ "vocab_size": 119547
40
+ }
rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-52374/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5a2c94b3fa4105f0848e039f53ebc926d5c50bab3ec3ef6ee702b2cac182fc88
3
+ size 1418281093
rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-52374/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e93ea760e69150887aa70367fc7c023cde354ea5bbdc7677ff27694947e6896b
3
+ size 709125289
rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-52374/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ff9f9ba597613c9b6de2929cd52809617964b278efcc8d2d7f05547c143c6c43
3
+ size 14575
rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-52374/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:58cdca43612bcf5318fcbf9957a0b062eac9129103ad04c51d2784c587eaceab
3
+ size 627
rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-52374/trainer_state.json ADDED
@@ -0,0 +1,727 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 7.0,
5
+ "eval_steps": 500,
6
+ "global_step": 52374,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.07,
13
+ "learning_rate": 2e-05,
14
+ "loss": 0.1264,
15
+ "step": 500
16
+ },
17
+ {
18
+ "epoch": 0.13,
19
+ "learning_rate": 1.9865446716899895e-05,
20
+ "loss": 0.0227,
21
+ "step": 1000
22
+ },
23
+ {
24
+ "epoch": 0.2,
25
+ "learning_rate": 1.9730893433799788e-05,
26
+ "loss": 0.0203,
27
+ "step": 1500
28
+ },
29
+ {
30
+ "epoch": 0.27,
31
+ "learning_rate": 1.959634015069968e-05,
32
+ "loss": 0.0197,
33
+ "step": 2000
34
+ },
35
+ {
36
+ "epoch": 0.33,
37
+ "learning_rate": 1.946178686759957e-05,
38
+ "loss": 0.0185,
39
+ "step": 2500
40
+ },
41
+ {
42
+ "epoch": 0.4,
43
+ "learning_rate": 1.9327233584499463e-05,
44
+ "loss": 0.0177,
45
+ "step": 3000
46
+ },
47
+ {
48
+ "epoch": 0.47,
49
+ "learning_rate": 1.9192680301399356e-05,
50
+ "loss": 0.0181,
51
+ "step": 3500
52
+ },
53
+ {
54
+ "epoch": 0.53,
55
+ "learning_rate": 1.905812701829925e-05,
56
+ "loss": 0.0171,
57
+ "step": 4000
58
+ },
59
+ {
60
+ "epoch": 0.6,
61
+ "learning_rate": 1.8923573735199142e-05,
62
+ "loss": 0.0174,
63
+ "step": 4500
64
+ },
65
+ {
66
+ "epoch": 0.67,
67
+ "learning_rate": 1.8789020452099035e-05,
68
+ "loss": 0.017,
69
+ "step": 5000
70
+ },
71
+ {
72
+ "epoch": 0.74,
73
+ "learning_rate": 1.8654467168998925e-05,
74
+ "loss": 0.0168,
75
+ "step": 5500
76
+ },
77
+ {
78
+ "epoch": 0.8,
79
+ "learning_rate": 1.8519913885898818e-05,
80
+ "loss": 0.0161,
81
+ "step": 6000
82
+ },
83
+ {
84
+ "epoch": 0.87,
85
+ "learning_rate": 1.838536060279871e-05,
86
+ "loss": 0.0156,
87
+ "step": 6500
88
+ },
89
+ {
90
+ "epoch": 0.94,
91
+ "learning_rate": 1.8250807319698604e-05,
92
+ "loss": 0.0167,
93
+ "step": 7000
94
+ },
95
+ {
96
+ "epoch": 1.0,
97
+ "eval_accuracy": 0.9943580499255906,
98
+ "eval_f1": 0.0,
99
+ "eval_loss": 0.015657523646950722,
100
+ "eval_precision": 0.0,
101
+ "eval_recall": 0.0,
102
+ "eval_runtime": 71.099,
103
+ "eval_samples_per_second": 708.913,
104
+ "eval_steps_per_second": 5.542,
105
+ "step": 7482
106
+ },
107
+ {
108
+ "epoch": 1.0,
109
+ "learning_rate": 1.8116254036598497e-05,
110
+ "loss": 0.0152,
111
+ "step": 7500
112
+ },
113
+ {
114
+ "epoch": 1.07,
115
+ "learning_rate": 1.7981700753498386e-05,
116
+ "loss": 0.0119,
117
+ "step": 8000
118
+ },
119
+ {
120
+ "epoch": 1.14,
121
+ "learning_rate": 1.784714747039828e-05,
122
+ "loss": 0.0116,
123
+ "step": 8500
124
+ },
125
+ {
126
+ "epoch": 1.2,
127
+ "learning_rate": 1.7712594187298172e-05,
128
+ "loss": 0.0116,
129
+ "step": 9000
130
+ },
131
+ {
132
+ "epoch": 1.27,
133
+ "learning_rate": 1.7578040904198065e-05,
134
+ "loss": 0.0114,
135
+ "step": 9500
136
+ },
137
+ {
138
+ "epoch": 1.34,
139
+ "learning_rate": 1.7443487621097955e-05,
140
+ "loss": 0.0114,
141
+ "step": 10000
142
+ },
143
+ {
144
+ "epoch": 1.4,
145
+ "learning_rate": 1.7308934337997848e-05,
146
+ "loss": 0.0114,
147
+ "step": 10500
148
+ },
149
+ {
150
+ "epoch": 1.47,
151
+ "learning_rate": 1.717438105489774e-05,
152
+ "loss": 0.0118,
153
+ "step": 11000
154
+ },
155
+ {
156
+ "epoch": 1.54,
157
+ "learning_rate": 1.7039827771797634e-05,
158
+ "loss": 0.0119,
159
+ "step": 11500
160
+ },
161
+ {
162
+ "epoch": 1.6,
163
+ "learning_rate": 1.6905274488697524e-05,
164
+ "loss": 0.0111,
165
+ "step": 12000
166
+ },
167
+ {
168
+ "epoch": 1.67,
169
+ "learning_rate": 1.6770721205597417e-05,
170
+ "loss": 0.0118,
171
+ "step": 12500
172
+ },
173
+ {
174
+ "epoch": 1.74,
175
+ "learning_rate": 1.663616792249731e-05,
176
+ "loss": 0.0116,
177
+ "step": 13000
178
+ },
179
+ {
180
+ "epoch": 1.8,
181
+ "learning_rate": 1.6501614639397203e-05,
182
+ "loss": 0.0112,
183
+ "step": 13500
184
+ },
185
+ {
186
+ "epoch": 1.87,
187
+ "learning_rate": 1.6367061356297096e-05,
188
+ "loss": 0.0114,
189
+ "step": 14000
190
+ },
191
+ {
192
+ "epoch": 1.94,
193
+ "learning_rate": 1.6232508073196985e-05,
194
+ "loss": 0.0115,
195
+ "step": 14500
196
+ },
197
+ {
198
+ "epoch": 2.0,
199
+ "eval_accuracy": 0.9940472490465726,
200
+ "eval_f1": 0.0,
201
+ "eval_loss": 0.016565397381782532,
202
+ "eval_precision": 0.0,
203
+ "eval_recall": 0.0,
204
+ "eval_runtime": 82.6741,
205
+ "eval_samples_per_second": 609.659,
206
+ "eval_steps_per_second": 4.766,
207
+ "step": 14964
208
+ },
209
+ {
210
+ "epoch": 2.0,
211
+ "learning_rate": 1.6097954790096878e-05,
212
+ "loss": 0.0109,
213
+ "step": 15000
214
+ },
215
+ {
216
+ "epoch": 2.07,
217
+ "learning_rate": 1.596340150699677e-05,
218
+ "loss": 0.0077,
219
+ "step": 15500
220
+ },
221
+ {
222
+ "epoch": 2.14,
223
+ "learning_rate": 1.5828848223896664e-05,
224
+ "loss": 0.0075,
225
+ "step": 16000
226
+ },
227
+ {
228
+ "epoch": 2.21,
229
+ "learning_rate": 1.5694294940796557e-05,
230
+ "loss": 0.0076,
231
+ "step": 16500
232
+ },
233
+ {
234
+ "epoch": 2.27,
235
+ "learning_rate": 1.555974165769645e-05,
236
+ "loss": 0.0076,
237
+ "step": 17000
238
+ },
239
+ {
240
+ "epoch": 2.34,
241
+ "learning_rate": 1.542518837459634e-05,
242
+ "loss": 0.0077,
243
+ "step": 17500
244
+ },
245
+ {
246
+ "epoch": 2.41,
247
+ "learning_rate": 1.5290635091496233e-05,
248
+ "loss": 0.0077,
249
+ "step": 18000
250
+ },
251
+ {
252
+ "epoch": 2.47,
253
+ "learning_rate": 1.5156081808396126e-05,
254
+ "loss": 0.0076,
255
+ "step": 18500
256
+ },
257
+ {
258
+ "epoch": 2.54,
259
+ "learning_rate": 1.5021528525296019e-05,
260
+ "loss": 0.0078,
261
+ "step": 19000
262
+ },
263
+ {
264
+ "epoch": 2.61,
265
+ "learning_rate": 1.4886975242195912e-05,
266
+ "loss": 0.008,
267
+ "step": 19500
268
+ },
269
+ {
270
+ "epoch": 2.67,
271
+ "learning_rate": 1.4752421959095805e-05,
272
+ "loss": 0.0077,
273
+ "step": 20000
274
+ },
275
+ {
276
+ "epoch": 2.74,
277
+ "learning_rate": 1.4617868675995694e-05,
278
+ "loss": 0.0078,
279
+ "step": 20500
280
+ },
281
+ {
282
+ "epoch": 2.81,
283
+ "learning_rate": 1.4483315392895587e-05,
284
+ "loss": 0.0081,
285
+ "step": 21000
286
+ },
287
+ {
288
+ "epoch": 2.87,
289
+ "learning_rate": 1.434876210979548e-05,
290
+ "loss": 0.0081,
291
+ "step": 21500
292
+ },
293
+ {
294
+ "epoch": 2.94,
295
+ "learning_rate": 1.4214208826695373e-05,
296
+ "loss": 0.0078,
297
+ "step": 22000
298
+ },
299
+ {
300
+ "epoch": 3.0,
301
+ "eval_accuracy": 0.9940838138558689,
302
+ "eval_f1": 0.0,
303
+ "eval_loss": 0.020097261294722557,
304
+ "eval_precision": 0.0,
305
+ "eval_recall": 0.0,
306
+ "eval_runtime": 71.581,
307
+ "eval_samples_per_second": 704.139,
308
+ "eval_steps_per_second": 5.504,
309
+ "step": 22446
310
+ },
311
+ {
312
+ "epoch": 3.01,
313
+ "learning_rate": 1.4079655543595265e-05,
314
+ "loss": 0.0074,
315
+ "step": 22500
316
+ },
317
+ {
318
+ "epoch": 3.07,
319
+ "learning_rate": 1.3945102260495158e-05,
320
+ "loss": 0.0051,
321
+ "step": 23000
322
+ },
323
+ {
324
+ "epoch": 3.14,
325
+ "learning_rate": 1.3810548977395049e-05,
326
+ "loss": 0.0052,
327
+ "step": 23500
328
+ },
329
+ {
330
+ "epoch": 3.21,
331
+ "learning_rate": 1.3675995694294942e-05,
332
+ "loss": 0.005,
333
+ "step": 24000
334
+ },
335
+ {
336
+ "epoch": 3.27,
337
+ "learning_rate": 1.3541442411194833e-05,
338
+ "loss": 0.0056,
339
+ "step": 24500
340
+ },
341
+ {
342
+ "epoch": 3.34,
343
+ "learning_rate": 1.3406889128094726e-05,
344
+ "loss": 0.0049,
345
+ "step": 25000
346
+ },
347
+ {
348
+ "epoch": 3.41,
349
+ "learning_rate": 1.3272335844994619e-05,
350
+ "loss": 0.0054,
351
+ "step": 25500
352
+ },
353
+ {
354
+ "epoch": 3.48,
355
+ "learning_rate": 1.3137782561894512e-05,
356
+ "loss": 0.0052,
357
+ "step": 26000
358
+ },
359
+ {
360
+ "epoch": 3.54,
361
+ "learning_rate": 1.3003229278794403e-05,
362
+ "loss": 0.0055,
363
+ "step": 26500
364
+ },
365
+ {
366
+ "epoch": 3.61,
367
+ "learning_rate": 1.2868675995694295e-05,
368
+ "loss": 0.0053,
369
+ "step": 27000
370
+ },
371
+ {
372
+ "epoch": 3.68,
373
+ "learning_rate": 1.2734122712594188e-05,
374
+ "loss": 0.0055,
375
+ "step": 27500
376
+ },
377
+ {
378
+ "epoch": 3.74,
379
+ "learning_rate": 1.259956942949408e-05,
380
+ "loss": 0.0056,
381
+ "step": 28000
382
+ },
383
+ {
384
+ "epoch": 3.81,
385
+ "learning_rate": 1.2465016146393974e-05,
386
+ "loss": 0.0056,
387
+ "step": 28500
388
+ },
389
+ {
390
+ "epoch": 3.88,
391
+ "learning_rate": 1.2330462863293867e-05,
392
+ "loss": 0.0056,
393
+ "step": 29000
394
+ },
395
+ {
396
+ "epoch": 3.94,
397
+ "learning_rate": 1.219590958019376e-05,
398
+ "loss": 0.0056,
399
+ "step": 29500
400
+ },
401
+ {
402
+ "epoch": 4.0,
403
+ "eval_accuracy": 0.9940216536800652,
404
+ "eval_f1": 0.0,
405
+ "eval_loss": 0.023068198934197426,
406
+ "eval_precision": 0.0,
407
+ "eval_recall": 0.0,
408
+ "eval_runtime": 81.8028,
409
+ "eval_samples_per_second": 616.153,
410
+ "eval_steps_per_second": 4.816,
411
+ "step": 29928
412
+ },
413
+ {
414
+ "epoch": 4.01,
415
+ "learning_rate": 1.206135629709365e-05,
416
+ "loss": 0.0053,
417
+ "step": 30000
418
+ },
419
+ {
420
+ "epoch": 4.08,
421
+ "learning_rate": 1.1926803013993542e-05,
422
+ "loss": 0.0035,
423
+ "step": 30500
424
+ },
425
+ {
426
+ "epoch": 4.14,
427
+ "learning_rate": 1.1792249730893435e-05,
428
+ "loss": 0.0037,
429
+ "step": 31000
430
+ },
431
+ {
432
+ "epoch": 4.21,
433
+ "learning_rate": 1.1657696447793328e-05,
434
+ "loss": 0.0039,
435
+ "step": 31500
436
+ },
437
+ {
438
+ "epoch": 4.28,
439
+ "learning_rate": 1.152314316469322e-05,
440
+ "loss": 0.0037,
441
+ "step": 32000
442
+ },
443
+ {
444
+ "epoch": 4.34,
445
+ "learning_rate": 1.1388589881593113e-05,
446
+ "loss": 0.0038,
447
+ "step": 32500
448
+ },
449
+ {
450
+ "epoch": 4.41,
451
+ "learning_rate": 1.1254036598493004e-05,
452
+ "loss": 0.004,
453
+ "step": 33000
454
+ },
455
+ {
456
+ "epoch": 4.48,
457
+ "learning_rate": 1.1119483315392897e-05,
458
+ "loss": 0.0039,
459
+ "step": 33500
460
+ },
461
+ {
462
+ "epoch": 4.54,
463
+ "learning_rate": 1.0984930032292788e-05,
464
+ "loss": 0.004,
465
+ "step": 34000
466
+ },
467
+ {
468
+ "epoch": 4.61,
469
+ "learning_rate": 1.0850376749192681e-05,
470
+ "loss": 0.0041,
471
+ "step": 34500
472
+ },
473
+ {
474
+ "epoch": 4.68,
475
+ "learning_rate": 1.0715823466092574e-05,
476
+ "loss": 0.0042,
477
+ "step": 35000
478
+ },
479
+ {
480
+ "epoch": 4.74,
481
+ "learning_rate": 1.0581270182992467e-05,
482
+ "loss": 0.004,
483
+ "step": 35500
484
+ },
485
+ {
486
+ "epoch": 4.81,
487
+ "learning_rate": 1.0446716899892357e-05,
488
+ "loss": 0.0042,
489
+ "step": 36000
490
+ },
491
+ {
492
+ "epoch": 4.88,
493
+ "learning_rate": 1.031216361679225e-05,
494
+ "loss": 0.0041,
495
+ "step": 36500
496
+ },
497
+ {
498
+ "epoch": 4.95,
499
+ "learning_rate": 1.0177610333692143e-05,
500
+ "loss": 0.0042,
501
+ "step": 37000
502
+ },
503
+ {
504
+ "epoch": 5.0,
505
+ "eval_accuracy": 0.9940509055275022,
506
+ "eval_f1": 0.0,
507
+ "eval_loss": 0.025542501360177994,
508
+ "eval_precision": 0.0,
509
+ "eval_recall": 0.0,
510
+ "eval_runtime": 71.0096,
511
+ "eval_samples_per_second": 709.806,
512
+ "eval_steps_per_second": 5.549,
513
+ "step": 37410
514
+ },
515
+ {
516
+ "epoch": 5.01,
517
+ "learning_rate": 1.0043057050592036e-05,
518
+ "loss": 0.0038,
519
+ "step": 37500
520
+ },
521
+ {
522
+ "epoch": 5.08,
523
+ "learning_rate": 9.908503767491927e-06,
524
+ "loss": 0.0029,
525
+ "step": 38000
526
+ },
527
+ {
528
+ "epoch": 5.15,
529
+ "learning_rate": 9.77395048439182e-06,
530
+ "loss": 0.003,
531
+ "step": 38500
532
+ },
533
+ {
534
+ "epoch": 5.21,
535
+ "learning_rate": 9.639397201291713e-06,
536
+ "loss": 0.003,
537
+ "step": 39000
538
+ },
539
+ {
540
+ "epoch": 5.28,
541
+ "learning_rate": 9.504843918191604e-06,
542
+ "loss": 0.003,
543
+ "step": 39500
544
+ },
545
+ {
546
+ "epoch": 5.35,
547
+ "learning_rate": 9.370290635091497e-06,
548
+ "loss": 0.0029,
549
+ "step": 40000
550
+ },
551
+ {
552
+ "epoch": 5.41,
553
+ "learning_rate": 9.23573735199139e-06,
554
+ "loss": 0.0031,
555
+ "step": 40500
556
+ },
557
+ {
558
+ "epoch": 5.48,
559
+ "learning_rate": 9.101184068891282e-06,
560
+ "loss": 0.0031,
561
+ "step": 41000
562
+ },
563
+ {
564
+ "epoch": 5.55,
565
+ "learning_rate": 8.966630785791175e-06,
566
+ "loss": 0.0031,
567
+ "step": 41500
568
+ },
569
+ {
570
+ "epoch": 5.61,
571
+ "learning_rate": 8.832077502691066e-06,
572
+ "loss": 0.0032,
573
+ "step": 42000
574
+ },
575
+ {
576
+ "epoch": 5.68,
577
+ "learning_rate": 8.697524219590959e-06,
578
+ "loss": 0.003,
579
+ "step": 42500
580
+ },
581
+ {
582
+ "epoch": 5.75,
583
+ "learning_rate": 8.56297093649085e-06,
584
+ "loss": 0.0032,
585
+ "step": 43000
586
+ },
587
+ {
588
+ "epoch": 5.81,
589
+ "learning_rate": 8.428417653390743e-06,
590
+ "loss": 0.0033,
591
+ "step": 43500
592
+ },
593
+ {
594
+ "epoch": 5.88,
595
+ "learning_rate": 8.293864370290636e-06,
596
+ "loss": 0.0032,
597
+ "step": 44000
598
+ },
599
+ {
600
+ "epoch": 5.95,
601
+ "learning_rate": 8.159311087190527e-06,
602
+ "loss": 0.0032,
603
+ "step": 44500
604
+ },
605
+ {
606
+ "epoch": 6.0,
607
+ "eval_accuracy": 0.9940655314512207,
608
+ "eval_f1": 0.0,
609
+ "eval_loss": 0.031192485243082047,
610
+ "eval_precision": 0.0,
611
+ "eval_recall": 0.0,
612
+ "eval_runtime": 81.6971,
613
+ "eval_samples_per_second": 616.95,
614
+ "eval_steps_per_second": 4.823,
615
+ "step": 44892
616
+ },
617
+ {
618
+ "epoch": 6.01,
619
+ "learning_rate": 8.02475780409042e-06,
620
+ "loss": 0.003,
621
+ "step": 45000
622
+ },
623
+ {
624
+ "epoch": 6.08,
625
+ "learning_rate": 7.890204520990313e-06,
626
+ "loss": 0.0022,
627
+ "step": 45500
628
+ },
629
+ {
630
+ "epoch": 6.15,
631
+ "learning_rate": 7.755651237890205e-06,
632
+ "loss": 0.0024,
633
+ "step": 46000
634
+ },
635
+ {
636
+ "epoch": 6.21,
637
+ "learning_rate": 7.621097954790098e-06,
638
+ "loss": 0.0022,
639
+ "step": 46500
640
+ },
641
+ {
642
+ "epoch": 6.28,
643
+ "learning_rate": 7.486544671689991e-06,
644
+ "loss": 0.0024,
645
+ "step": 47000
646
+ },
647
+ {
648
+ "epoch": 6.35,
649
+ "learning_rate": 7.351991388589882e-06,
650
+ "loss": 0.0025,
651
+ "step": 47500
652
+ },
653
+ {
654
+ "epoch": 6.42,
655
+ "learning_rate": 7.217438105489775e-06,
656
+ "loss": 0.0025,
657
+ "step": 48000
658
+ },
659
+ {
660
+ "epoch": 6.48,
661
+ "learning_rate": 7.082884822389667e-06,
662
+ "loss": 0.0025,
663
+ "step": 48500
664
+ },
665
+ {
666
+ "epoch": 6.55,
667
+ "learning_rate": 6.948331539289559e-06,
668
+ "loss": 0.0026,
669
+ "step": 49000
670
+ },
671
+ {
672
+ "epoch": 6.62,
673
+ "learning_rate": 6.813778256189451e-06,
674
+ "loss": 0.0025,
675
+ "step": 49500
676
+ },
677
+ {
678
+ "epoch": 6.68,
679
+ "learning_rate": 6.679224973089344e-06,
680
+ "loss": 0.0026,
681
+ "step": 50000
682
+ },
683
+ {
684
+ "epoch": 6.75,
685
+ "learning_rate": 6.544671689989236e-06,
686
+ "loss": 0.0025,
687
+ "step": 50500
688
+ },
689
+ {
690
+ "epoch": 6.82,
691
+ "learning_rate": 6.410118406889129e-06,
692
+ "loss": 0.0025,
693
+ "step": 51000
694
+ },
695
+ {
696
+ "epoch": 6.88,
697
+ "learning_rate": 6.275565123789022e-06,
698
+ "loss": 0.0025,
699
+ "step": 51500
700
+ },
701
+ {
702
+ "epoch": 6.95,
703
+ "learning_rate": 6.141011840688913e-06,
704
+ "loss": 0.0025,
705
+ "step": 52000
706
+ },
707
+ {
708
+ "epoch": 7.0,
709
+ "eval_accuracy": 0.9939119592521766,
710
+ "eval_f1": 0.0,
711
+ "eval_loss": 0.03196028992533684,
712
+ "eval_precision": 0.0,
713
+ "eval_recall": 0.0,
714
+ "eval_runtime": 71.5312,
715
+ "eval_samples_per_second": 704.63,
716
+ "eval_steps_per_second": 5.508,
717
+ "step": 52374
718
+ }
719
+ ],
720
+ "logging_steps": 500,
721
+ "max_steps": 74820,
722
+ "num_train_epochs": 10,
723
+ "save_steps": 500,
724
+ "total_flos": 5.95287517459002e+17,
725
+ "trial_name": null,
726
+ "trial_params": null
727
+ }
rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-52374/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f713488e20afd5141b39f9aaa67feba7da0a360f752dbaa308013d8d5212f449
3
+ size 4091
rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-59856/config.json ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "DeepPavlov/rubert-base-cased-conversational",
3
+ "architectures": [
4
+ "BertForTokenClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "classifier_dropout": null,
8
+ "directionality": "bidi",
9
+ "hidden_act": "gelu",
10
+ "hidden_dropout_prob": 0.1,
11
+ "hidden_size": 768,
12
+ "id2label": {
13
+ "0": "O",
14
+ "1": "U"
15
+ },
16
+ "initializer_range": 0.02,
17
+ "intermediate_size": 3072,
18
+ "label2id": {
19
+ "O": 0,
20
+ "U": 1
21
+ },
22
+ "layer_norm_eps": 1e-12,
23
+ "max_position_embeddings": 512,
24
+ "model_type": "bert",
25
+ "num_attention_heads": 12,
26
+ "num_hidden_layers": 12,
27
+ "output_past": true,
28
+ "pad_token_id": 0,
29
+ "pooler_fc_size": 768,
30
+ "pooler_num_attention_heads": 12,
31
+ "pooler_num_fc_layers": 3,
32
+ "pooler_size_per_head": 128,
33
+ "pooler_type": "first_token_transform",
34
+ "position_embedding_type": "absolute",
35
+ "torch_dtype": "float32",
36
+ "transformers_version": "4.32.1",
37
+ "type_vocab_size": 2,
38
+ "use_cache": true,
39
+ "vocab_size": 119547
40
+ }
rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-59856/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4d99c525306a2f5415aced53b4f26a4f948a39d19ab91beee79f05a3c0d2dc91
3
+ size 1418281093
rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-59856/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4584cb25d54c7bc100b750d5185dab318806f87dc469c72235b959bd7ea4852c
3
+ size 709125289
rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-59856/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d630713282a7e689b0e08e8ab7654d39f94f47b8f456bac345c21dba19706091
3
+ size 14575
rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-59856/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:67fdbe2ad084cae31bd619a47ba1b70ba128ff67dd25af4b47b1bd149313fe24
3
+ size 627
rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-59856/trainer_state.json ADDED
@@ -0,0 +1,829 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 8.0,
5
+ "eval_steps": 500,
6
+ "global_step": 59856,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.07,
13
+ "learning_rate": 2e-05,
14
+ "loss": 0.1264,
15
+ "step": 500
16
+ },
17
+ {
18
+ "epoch": 0.13,
19
+ "learning_rate": 1.9865446716899895e-05,
20
+ "loss": 0.0227,
21
+ "step": 1000
22
+ },
23
+ {
24
+ "epoch": 0.2,
25
+ "learning_rate": 1.9730893433799788e-05,
26
+ "loss": 0.0203,
27
+ "step": 1500
28
+ },
29
+ {
30
+ "epoch": 0.27,
31
+ "learning_rate": 1.959634015069968e-05,
32
+ "loss": 0.0197,
33
+ "step": 2000
34
+ },
35
+ {
36
+ "epoch": 0.33,
37
+ "learning_rate": 1.946178686759957e-05,
38
+ "loss": 0.0185,
39
+ "step": 2500
40
+ },
41
+ {
42
+ "epoch": 0.4,
43
+ "learning_rate": 1.9327233584499463e-05,
44
+ "loss": 0.0177,
45
+ "step": 3000
46
+ },
47
+ {
48
+ "epoch": 0.47,
49
+ "learning_rate": 1.9192680301399356e-05,
50
+ "loss": 0.0181,
51
+ "step": 3500
52
+ },
53
+ {
54
+ "epoch": 0.53,
55
+ "learning_rate": 1.905812701829925e-05,
56
+ "loss": 0.0171,
57
+ "step": 4000
58
+ },
59
+ {
60
+ "epoch": 0.6,
61
+ "learning_rate": 1.8923573735199142e-05,
62
+ "loss": 0.0174,
63
+ "step": 4500
64
+ },
65
+ {
66
+ "epoch": 0.67,
67
+ "learning_rate": 1.8789020452099035e-05,
68
+ "loss": 0.017,
69
+ "step": 5000
70
+ },
71
+ {
72
+ "epoch": 0.74,
73
+ "learning_rate": 1.8654467168998925e-05,
74
+ "loss": 0.0168,
75
+ "step": 5500
76
+ },
77
+ {
78
+ "epoch": 0.8,
79
+ "learning_rate": 1.8519913885898818e-05,
80
+ "loss": 0.0161,
81
+ "step": 6000
82
+ },
83
+ {
84
+ "epoch": 0.87,
85
+ "learning_rate": 1.838536060279871e-05,
86
+ "loss": 0.0156,
87
+ "step": 6500
88
+ },
89
+ {
90
+ "epoch": 0.94,
91
+ "learning_rate": 1.8250807319698604e-05,
92
+ "loss": 0.0167,
93
+ "step": 7000
94
+ },
95
+ {
96
+ "epoch": 1.0,
97
+ "eval_accuracy": 0.9943580499255906,
98
+ "eval_f1": 0.0,
99
+ "eval_loss": 0.015657523646950722,
100
+ "eval_precision": 0.0,
101
+ "eval_recall": 0.0,
102
+ "eval_runtime": 71.099,
103
+ "eval_samples_per_second": 708.913,
104
+ "eval_steps_per_second": 5.542,
105
+ "step": 7482
106
+ },
107
+ {
108
+ "epoch": 1.0,
109
+ "learning_rate": 1.8116254036598497e-05,
110
+ "loss": 0.0152,
111
+ "step": 7500
112
+ },
113
+ {
114
+ "epoch": 1.07,
115
+ "learning_rate": 1.7981700753498386e-05,
116
+ "loss": 0.0119,
117
+ "step": 8000
118
+ },
119
+ {
120
+ "epoch": 1.14,
121
+ "learning_rate": 1.784714747039828e-05,
122
+ "loss": 0.0116,
123
+ "step": 8500
124
+ },
125
+ {
126
+ "epoch": 1.2,
127
+ "learning_rate": 1.7712594187298172e-05,
128
+ "loss": 0.0116,
129
+ "step": 9000
130
+ },
131
+ {
132
+ "epoch": 1.27,
133
+ "learning_rate": 1.7578040904198065e-05,
134
+ "loss": 0.0114,
135
+ "step": 9500
136
+ },
137
+ {
138
+ "epoch": 1.34,
139
+ "learning_rate": 1.7443487621097955e-05,
140
+ "loss": 0.0114,
141
+ "step": 10000
142
+ },
143
+ {
144
+ "epoch": 1.4,
145
+ "learning_rate": 1.7308934337997848e-05,
146
+ "loss": 0.0114,
147
+ "step": 10500
148
+ },
149
+ {
150
+ "epoch": 1.47,
151
+ "learning_rate": 1.717438105489774e-05,
152
+ "loss": 0.0118,
153
+ "step": 11000
154
+ },
155
+ {
156
+ "epoch": 1.54,
157
+ "learning_rate": 1.7039827771797634e-05,
158
+ "loss": 0.0119,
159
+ "step": 11500
160
+ },
161
+ {
162
+ "epoch": 1.6,
163
+ "learning_rate": 1.6905274488697524e-05,
164
+ "loss": 0.0111,
165
+ "step": 12000
166
+ },
167
+ {
168
+ "epoch": 1.67,
169
+ "learning_rate": 1.6770721205597417e-05,
170
+ "loss": 0.0118,
171
+ "step": 12500
172
+ },
173
+ {
174
+ "epoch": 1.74,
175
+ "learning_rate": 1.663616792249731e-05,
176
+ "loss": 0.0116,
177
+ "step": 13000
178
+ },
179
+ {
180
+ "epoch": 1.8,
181
+ "learning_rate": 1.6501614639397203e-05,
182
+ "loss": 0.0112,
183
+ "step": 13500
184
+ },
185
+ {
186
+ "epoch": 1.87,
187
+ "learning_rate": 1.6367061356297096e-05,
188
+ "loss": 0.0114,
189
+ "step": 14000
190
+ },
191
+ {
192
+ "epoch": 1.94,
193
+ "learning_rate": 1.6232508073196985e-05,
194
+ "loss": 0.0115,
195
+ "step": 14500
196
+ },
197
+ {
198
+ "epoch": 2.0,
199
+ "eval_accuracy": 0.9940472490465726,
200
+ "eval_f1": 0.0,
201
+ "eval_loss": 0.016565397381782532,
202
+ "eval_precision": 0.0,
203
+ "eval_recall": 0.0,
204
+ "eval_runtime": 82.6741,
205
+ "eval_samples_per_second": 609.659,
206
+ "eval_steps_per_second": 4.766,
207
+ "step": 14964
208
+ },
209
+ {
210
+ "epoch": 2.0,
211
+ "learning_rate": 1.6097954790096878e-05,
212
+ "loss": 0.0109,
213
+ "step": 15000
214
+ },
215
+ {
216
+ "epoch": 2.07,
217
+ "learning_rate": 1.596340150699677e-05,
218
+ "loss": 0.0077,
219
+ "step": 15500
220
+ },
221
+ {
222
+ "epoch": 2.14,
223
+ "learning_rate": 1.5828848223896664e-05,
224
+ "loss": 0.0075,
225
+ "step": 16000
226
+ },
227
+ {
228
+ "epoch": 2.21,
229
+ "learning_rate": 1.5694294940796557e-05,
230
+ "loss": 0.0076,
231
+ "step": 16500
232
+ },
233
+ {
234
+ "epoch": 2.27,
235
+ "learning_rate": 1.555974165769645e-05,
236
+ "loss": 0.0076,
237
+ "step": 17000
238
+ },
239
+ {
240
+ "epoch": 2.34,
241
+ "learning_rate": 1.542518837459634e-05,
242
+ "loss": 0.0077,
243
+ "step": 17500
244
+ },
245
+ {
246
+ "epoch": 2.41,
247
+ "learning_rate": 1.5290635091496233e-05,
248
+ "loss": 0.0077,
249
+ "step": 18000
250
+ },
251
+ {
252
+ "epoch": 2.47,
253
+ "learning_rate": 1.5156081808396126e-05,
254
+ "loss": 0.0076,
255
+ "step": 18500
256
+ },
257
+ {
258
+ "epoch": 2.54,
259
+ "learning_rate": 1.5021528525296019e-05,
260
+ "loss": 0.0078,
261
+ "step": 19000
262
+ },
263
+ {
264
+ "epoch": 2.61,
265
+ "learning_rate": 1.4886975242195912e-05,
266
+ "loss": 0.008,
267
+ "step": 19500
268
+ },
269
+ {
270
+ "epoch": 2.67,
271
+ "learning_rate": 1.4752421959095805e-05,
272
+ "loss": 0.0077,
273
+ "step": 20000
274
+ },
275
+ {
276
+ "epoch": 2.74,
277
+ "learning_rate": 1.4617868675995694e-05,
278
+ "loss": 0.0078,
279
+ "step": 20500
280
+ },
281
+ {
282
+ "epoch": 2.81,
283
+ "learning_rate": 1.4483315392895587e-05,
284
+ "loss": 0.0081,
285
+ "step": 21000
286
+ },
287
+ {
288
+ "epoch": 2.87,
289
+ "learning_rate": 1.434876210979548e-05,
290
+ "loss": 0.0081,
291
+ "step": 21500
292
+ },
293
+ {
294
+ "epoch": 2.94,
295
+ "learning_rate": 1.4214208826695373e-05,
296
+ "loss": 0.0078,
297
+ "step": 22000
298
+ },
299
+ {
300
+ "epoch": 3.0,
301
+ "eval_accuracy": 0.9940838138558689,
302
+ "eval_f1": 0.0,
303
+ "eval_loss": 0.020097261294722557,
304
+ "eval_precision": 0.0,
305
+ "eval_recall": 0.0,
306
+ "eval_runtime": 71.581,
307
+ "eval_samples_per_second": 704.139,
308
+ "eval_steps_per_second": 5.504,
309
+ "step": 22446
310
+ },
311
+ {
312
+ "epoch": 3.01,
313
+ "learning_rate": 1.4079655543595265e-05,
314
+ "loss": 0.0074,
315
+ "step": 22500
316
+ },
317
+ {
318
+ "epoch": 3.07,
319
+ "learning_rate": 1.3945102260495158e-05,
320
+ "loss": 0.0051,
321
+ "step": 23000
322
+ },
323
+ {
324
+ "epoch": 3.14,
325
+ "learning_rate": 1.3810548977395049e-05,
326
+ "loss": 0.0052,
327
+ "step": 23500
328
+ },
329
+ {
330
+ "epoch": 3.21,
331
+ "learning_rate": 1.3675995694294942e-05,
332
+ "loss": 0.005,
333
+ "step": 24000
334
+ },
335
+ {
336
+ "epoch": 3.27,
337
+ "learning_rate": 1.3541442411194833e-05,
338
+ "loss": 0.0056,
339
+ "step": 24500
340
+ },
341
+ {
342
+ "epoch": 3.34,
343
+ "learning_rate": 1.3406889128094726e-05,
344
+ "loss": 0.0049,
345
+ "step": 25000
346
+ },
347
+ {
348
+ "epoch": 3.41,
349
+ "learning_rate": 1.3272335844994619e-05,
350
+ "loss": 0.0054,
351
+ "step": 25500
352
+ },
353
+ {
354
+ "epoch": 3.48,
355
+ "learning_rate": 1.3137782561894512e-05,
356
+ "loss": 0.0052,
357
+ "step": 26000
358
+ },
359
+ {
360
+ "epoch": 3.54,
361
+ "learning_rate": 1.3003229278794403e-05,
362
+ "loss": 0.0055,
363
+ "step": 26500
364
+ },
365
+ {
366
+ "epoch": 3.61,
367
+ "learning_rate": 1.2868675995694295e-05,
368
+ "loss": 0.0053,
369
+ "step": 27000
370
+ },
371
+ {
372
+ "epoch": 3.68,
373
+ "learning_rate": 1.2734122712594188e-05,
374
+ "loss": 0.0055,
375
+ "step": 27500
376
+ },
377
+ {
378
+ "epoch": 3.74,
379
+ "learning_rate": 1.259956942949408e-05,
380
+ "loss": 0.0056,
381
+ "step": 28000
382
+ },
383
+ {
384
+ "epoch": 3.81,
385
+ "learning_rate": 1.2465016146393974e-05,
386
+ "loss": 0.0056,
387
+ "step": 28500
388
+ },
389
+ {
390
+ "epoch": 3.88,
391
+ "learning_rate": 1.2330462863293867e-05,
392
+ "loss": 0.0056,
393
+ "step": 29000
394
+ },
395
+ {
396
+ "epoch": 3.94,
397
+ "learning_rate": 1.219590958019376e-05,
398
+ "loss": 0.0056,
399
+ "step": 29500
400
+ },
401
+ {
402
+ "epoch": 4.0,
403
+ "eval_accuracy": 0.9940216536800652,
404
+ "eval_f1": 0.0,
405
+ "eval_loss": 0.023068198934197426,
406
+ "eval_precision": 0.0,
407
+ "eval_recall": 0.0,
408
+ "eval_runtime": 81.8028,
409
+ "eval_samples_per_second": 616.153,
410
+ "eval_steps_per_second": 4.816,
411
+ "step": 29928
412
+ },
413
+ {
414
+ "epoch": 4.01,
415
+ "learning_rate": 1.206135629709365e-05,
416
+ "loss": 0.0053,
417
+ "step": 30000
418
+ },
419
+ {
420
+ "epoch": 4.08,
421
+ "learning_rate": 1.1926803013993542e-05,
422
+ "loss": 0.0035,
423
+ "step": 30500
424
+ },
425
+ {
426
+ "epoch": 4.14,
427
+ "learning_rate": 1.1792249730893435e-05,
428
+ "loss": 0.0037,
429
+ "step": 31000
430
+ },
431
+ {
432
+ "epoch": 4.21,
433
+ "learning_rate": 1.1657696447793328e-05,
434
+ "loss": 0.0039,
435
+ "step": 31500
436
+ },
437
+ {
438
+ "epoch": 4.28,
439
+ "learning_rate": 1.152314316469322e-05,
440
+ "loss": 0.0037,
441
+ "step": 32000
442
+ },
443
+ {
444
+ "epoch": 4.34,
445
+ "learning_rate": 1.1388589881593113e-05,
446
+ "loss": 0.0038,
447
+ "step": 32500
448
+ },
449
+ {
450
+ "epoch": 4.41,
451
+ "learning_rate": 1.1254036598493004e-05,
452
+ "loss": 0.004,
453
+ "step": 33000
454
+ },
455
+ {
456
+ "epoch": 4.48,
457
+ "learning_rate": 1.1119483315392897e-05,
458
+ "loss": 0.0039,
459
+ "step": 33500
460
+ },
461
+ {
462
+ "epoch": 4.54,
463
+ "learning_rate": 1.0984930032292788e-05,
464
+ "loss": 0.004,
465
+ "step": 34000
466
+ },
467
+ {
468
+ "epoch": 4.61,
469
+ "learning_rate": 1.0850376749192681e-05,
470
+ "loss": 0.0041,
471
+ "step": 34500
472
+ },
473
+ {
474
+ "epoch": 4.68,
475
+ "learning_rate": 1.0715823466092574e-05,
476
+ "loss": 0.0042,
477
+ "step": 35000
478
+ },
479
+ {
480
+ "epoch": 4.74,
481
+ "learning_rate": 1.0581270182992467e-05,
482
+ "loss": 0.004,
483
+ "step": 35500
484
+ },
485
+ {
486
+ "epoch": 4.81,
487
+ "learning_rate": 1.0446716899892357e-05,
488
+ "loss": 0.0042,
489
+ "step": 36000
490
+ },
491
+ {
492
+ "epoch": 4.88,
493
+ "learning_rate": 1.031216361679225e-05,
494
+ "loss": 0.0041,
495
+ "step": 36500
496
+ },
497
+ {
498
+ "epoch": 4.95,
499
+ "learning_rate": 1.0177610333692143e-05,
500
+ "loss": 0.0042,
501
+ "step": 37000
502
+ },
503
+ {
504
+ "epoch": 5.0,
505
+ "eval_accuracy": 0.9940509055275022,
506
+ "eval_f1": 0.0,
507
+ "eval_loss": 0.025542501360177994,
508
+ "eval_precision": 0.0,
509
+ "eval_recall": 0.0,
510
+ "eval_runtime": 71.0096,
511
+ "eval_samples_per_second": 709.806,
512
+ "eval_steps_per_second": 5.549,
513
+ "step": 37410
514
+ },
515
+ {
516
+ "epoch": 5.01,
517
+ "learning_rate": 1.0043057050592036e-05,
518
+ "loss": 0.0038,
519
+ "step": 37500
520
+ },
521
+ {
522
+ "epoch": 5.08,
523
+ "learning_rate": 9.908503767491927e-06,
524
+ "loss": 0.0029,
525
+ "step": 38000
526
+ },
527
+ {
528
+ "epoch": 5.15,
529
+ "learning_rate": 9.77395048439182e-06,
530
+ "loss": 0.003,
531
+ "step": 38500
532
+ },
533
+ {
534
+ "epoch": 5.21,
535
+ "learning_rate": 9.639397201291713e-06,
536
+ "loss": 0.003,
537
+ "step": 39000
538
+ },
539
+ {
540
+ "epoch": 5.28,
541
+ "learning_rate": 9.504843918191604e-06,
542
+ "loss": 0.003,
543
+ "step": 39500
544
+ },
545
+ {
546
+ "epoch": 5.35,
547
+ "learning_rate": 9.370290635091497e-06,
548
+ "loss": 0.0029,
549
+ "step": 40000
550
+ },
551
+ {
552
+ "epoch": 5.41,
553
+ "learning_rate": 9.23573735199139e-06,
554
+ "loss": 0.0031,
555
+ "step": 40500
556
+ },
557
+ {
558
+ "epoch": 5.48,
559
+ "learning_rate": 9.101184068891282e-06,
560
+ "loss": 0.0031,
561
+ "step": 41000
562
+ },
563
+ {
564
+ "epoch": 5.55,
565
+ "learning_rate": 8.966630785791175e-06,
566
+ "loss": 0.0031,
567
+ "step": 41500
568
+ },
569
+ {
570
+ "epoch": 5.61,
571
+ "learning_rate": 8.832077502691066e-06,
572
+ "loss": 0.0032,
573
+ "step": 42000
574
+ },
575
+ {
576
+ "epoch": 5.68,
577
+ "learning_rate": 8.697524219590959e-06,
578
+ "loss": 0.003,
579
+ "step": 42500
580
+ },
581
+ {
582
+ "epoch": 5.75,
583
+ "learning_rate": 8.56297093649085e-06,
584
+ "loss": 0.0032,
585
+ "step": 43000
586
+ },
587
+ {
588
+ "epoch": 5.81,
589
+ "learning_rate": 8.428417653390743e-06,
590
+ "loss": 0.0033,
591
+ "step": 43500
592
+ },
593
+ {
594
+ "epoch": 5.88,
595
+ "learning_rate": 8.293864370290636e-06,
596
+ "loss": 0.0032,
597
+ "step": 44000
598
+ },
599
+ {
600
+ "epoch": 5.95,
601
+ "learning_rate": 8.159311087190527e-06,
602
+ "loss": 0.0032,
603
+ "step": 44500
604
+ },
605
+ {
606
+ "epoch": 6.0,
607
+ "eval_accuracy": 0.9940655314512207,
608
+ "eval_f1": 0.0,
609
+ "eval_loss": 0.031192485243082047,
610
+ "eval_precision": 0.0,
611
+ "eval_recall": 0.0,
612
+ "eval_runtime": 81.6971,
613
+ "eval_samples_per_second": 616.95,
614
+ "eval_steps_per_second": 4.823,
615
+ "step": 44892
616
+ },
617
+ {
618
+ "epoch": 6.01,
619
+ "learning_rate": 8.02475780409042e-06,
620
+ "loss": 0.003,
621
+ "step": 45000
622
+ },
623
+ {
624
+ "epoch": 6.08,
625
+ "learning_rate": 7.890204520990313e-06,
626
+ "loss": 0.0022,
627
+ "step": 45500
628
+ },
629
+ {
630
+ "epoch": 6.15,
631
+ "learning_rate": 7.755651237890205e-06,
632
+ "loss": 0.0024,
633
+ "step": 46000
634
+ },
635
+ {
636
+ "epoch": 6.21,
637
+ "learning_rate": 7.621097954790098e-06,
638
+ "loss": 0.0022,
639
+ "step": 46500
640
+ },
641
+ {
642
+ "epoch": 6.28,
643
+ "learning_rate": 7.486544671689991e-06,
644
+ "loss": 0.0024,
645
+ "step": 47000
646
+ },
647
+ {
648
+ "epoch": 6.35,
649
+ "learning_rate": 7.351991388589882e-06,
650
+ "loss": 0.0025,
651
+ "step": 47500
652
+ },
653
+ {
654
+ "epoch": 6.42,
655
+ "learning_rate": 7.217438105489775e-06,
656
+ "loss": 0.0025,
657
+ "step": 48000
658
+ },
659
+ {
660
+ "epoch": 6.48,
661
+ "learning_rate": 7.082884822389667e-06,
662
+ "loss": 0.0025,
663
+ "step": 48500
664
+ },
665
+ {
666
+ "epoch": 6.55,
667
+ "learning_rate": 6.948331539289559e-06,
668
+ "loss": 0.0026,
669
+ "step": 49000
670
+ },
671
+ {
672
+ "epoch": 6.62,
673
+ "learning_rate": 6.813778256189451e-06,
674
+ "loss": 0.0025,
675
+ "step": 49500
676
+ },
677
+ {
678
+ "epoch": 6.68,
679
+ "learning_rate": 6.679224973089344e-06,
680
+ "loss": 0.0026,
681
+ "step": 50000
682
+ },
683
+ {
684
+ "epoch": 6.75,
685
+ "learning_rate": 6.544671689989236e-06,
686
+ "loss": 0.0025,
687
+ "step": 50500
688
+ },
689
+ {
690
+ "epoch": 6.82,
691
+ "learning_rate": 6.410118406889129e-06,
692
+ "loss": 0.0025,
693
+ "step": 51000
694
+ },
695
+ {
696
+ "epoch": 6.88,
697
+ "learning_rate": 6.275565123789022e-06,
698
+ "loss": 0.0025,
699
+ "step": 51500
700
+ },
701
+ {
702
+ "epoch": 6.95,
703
+ "learning_rate": 6.141011840688913e-06,
704
+ "loss": 0.0025,
705
+ "step": 52000
706
+ },
707
+ {
708
+ "epoch": 7.0,
709
+ "eval_accuracy": 0.9939119592521766,
710
+ "eval_f1": 0.0,
711
+ "eval_loss": 0.03196028992533684,
712
+ "eval_precision": 0.0,
713
+ "eval_recall": 0.0,
714
+ "eval_runtime": 71.5312,
715
+ "eval_samples_per_second": 704.63,
716
+ "eval_steps_per_second": 5.508,
717
+ "step": 52374
718
+ },
719
+ {
720
+ "epoch": 7.02,
721
+ "learning_rate": 6.006458557588806e-06,
722
+ "loss": 0.0023,
723
+ "step": 52500
724
+ },
725
+ {
726
+ "epoch": 7.08,
727
+ "learning_rate": 5.871905274488698e-06,
728
+ "loss": 0.002,
729
+ "step": 53000
730
+ },
731
+ {
732
+ "epoch": 7.15,
733
+ "learning_rate": 5.73735199138859e-06,
734
+ "loss": 0.0019,
735
+ "step": 53500
736
+ },
737
+ {
738
+ "epoch": 7.22,
739
+ "learning_rate": 5.602798708288482e-06,
740
+ "loss": 0.002,
741
+ "step": 54000
742
+ },
743
+ {
744
+ "epoch": 7.28,
745
+ "learning_rate": 5.468245425188375e-06,
746
+ "loss": 0.002,
747
+ "step": 54500
748
+ },
749
+ {
750
+ "epoch": 7.35,
751
+ "learning_rate": 5.333692142088267e-06,
752
+ "loss": 0.0019,
753
+ "step": 55000
754
+ },
755
+ {
756
+ "epoch": 7.42,
757
+ "learning_rate": 5.19913885898816e-06,
758
+ "loss": 0.002,
759
+ "step": 55500
760
+ },
761
+ {
762
+ "epoch": 7.48,
763
+ "learning_rate": 5.064585575888053e-06,
764
+ "loss": 0.0018,
765
+ "step": 56000
766
+ },
767
+ {
768
+ "epoch": 7.55,
769
+ "learning_rate": 4.930032292787945e-06,
770
+ "loss": 0.0022,
771
+ "step": 56500
772
+ },
773
+ {
774
+ "epoch": 7.62,
775
+ "learning_rate": 4.795479009687837e-06,
776
+ "loss": 0.002,
777
+ "step": 57000
778
+ },
779
+ {
780
+ "epoch": 7.69,
781
+ "learning_rate": 4.660925726587729e-06,
782
+ "loss": 0.002,
783
+ "step": 57500
784
+ },
785
+ {
786
+ "epoch": 7.75,
787
+ "learning_rate": 4.526372443487621e-06,
788
+ "loss": 0.0021,
789
+ "step": 58000
790
+ },
791
+ {
792
+ "epoch": 7.82,
793
+ "learning_rate": 4.391819160387513e-06,
794
+ "loss": 0.0021,
795
+ "step": 58500
796
+ },
797
+ {
798
+ "epoch": 7.89,
799
+ "learning_rate": 4.2572658772874056e-06,
800
+ "loss": 0.0021,
801
+ "step": 59000
802
+ },
803
+ {
804
+ "epoch": 7.95,
805
+ "learning_rate": 4.1227125941872986e-06,
806
+ "loss": 0.0023,
807
+ "step": 59500
808
+ },
809
+ {
810
+ "epoch": 8.0,
811
+ "eval_accuracy": 0.9939704629470505,
812
+ "eval_f1": 0.0,
813
+ "eval_loss": 0.03371906280517578,
814
+ "eval_precision": 0.0,
815
+ "eval_recall": 0.0,
816
+ "eval_runtime": 81.4913,
817
+ "eval_samples_per_second": 618.508,
818
+ "eval_steps_per_second": 4.835,
819
+ "step": 59856
820
+ }
821
+ ],
822
+ "logging_steps": 500,
823
+ "max_steps": 74820,
824
+ "num_train_epochs": 10,
825
+ "save_steps": 500,
826
+ "total_flos": 6.803285913817166e+17,
827
+ "trial_name": null,
828
+ "trial_params": null
829
+ }
rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-59856/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f713488e20afd5141b39f9aaa67feba7da0a360f752dbaa308013d8d5212f449
3
+ size 4091
rubert-base-cased-conversational-512-tatoeba_dataset/02-09-2023-11-01-00/checkpoint-67338/config.json ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "DeepPavlov/rubert-base-cased-conversational",
3
+ "architectures": [
4
+ "BertForTokenClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "classifier_dropout": null,
8
+ "directionality": "bidi",
9
+ "hidden_act": "gelu",
10
+ "hidden_dropout_prob": 0.1,
11
+ "hidden_size": 768,
12
+ "id2label": {
13
+ "0": "O",
14
+ "1": "U"
15
+ },
16
+ "initializer_range": 0.02,
17
+ "intermediate_size": 3072,
18
+ "label2id": {
19
+ "O": 0,
20
+ "U": 1
21
+ },
22
+ "layer_norm_eps": 1e-12,
23
+ "max_position_embeddings": 512,
24
+ "model_type": "bert",
25
+ "num_attention_heads": 12,
26
+ "num_hidden_layers": 12,
27
+ "output_past": true,
28
+ "pad_token_id": 0,
29
+ "pooler_fc_size": 768,
30
+ "pooler_num_attention_heads": 12,
31
+ "pooler_num_fc_layers": 3,
32
+ "pooler_size_per_head": 128,
33
+ "pooler_type": "first_token_transform",
34
+ "position_embedding_type": "absolute",
35
+ "torch_dtype": "float32",
36
+ "transformers_version": "4.32.1",
37
+ "type_vocab_size": 2,
38
+ "use_cache": true,
39
+ "vocab_size": 119547
40
+ }