ArabovMK commited on
Commit
8885ab1
·
verified ·
1 Parent(s): d9aca72

Upload training_summary.json with huggingface_hub

Browse files
Files changed (1) hide show
  1. training_summary.json +318 -0
training_summary.json ADDED
@@ -0,0 +1,318 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bpe": {
3
+ "v8000_mf2": {
4
+ "out_dir": "results\\bpe\\v8000_mf2",
5
+ "metrics": {
6
+ "oov_rate": 0.0,
7
+ "avg_sequence_length": 96.0113,
8
+ "avg_processing_time_ms": 0.19588143825531007,
9
+ "compression_ratio": 96.0113,
10
+ "total_tokens_evaluated": 1920226,
11
+ "unk_count": 0,
12
+ "train_time_s": 105.87230825424194,
13
+ "config": {
14
+ "vocab_size": 8000,
15
+ "min_frequency": 2,
16
+ "continuing_subword_prefix": "##"
17
+ }
18
+ }
19
+ },
20
+ "v8000_mf5": {
21
+ "out_dir": "results\\bpe\\v8000_mf5",
22
+ "metrics": {
23
+ "oov_rate": 0.0,
24
+ "avg_sequence_length": 96.0113,
25
+ "avg_processing_time_ms": 0.19508297443389894,
26
+ "compression_ratio": 96.0113,
27
+ "total_tokens_evaluated": 1920226,
28
+ "unk_count": 0,
29
+ "train_time_s": 115.85335993766785,
30
+ "config": {
31
+ "vocab_size": 8000,
32
+ "min_frequency": 5,
33
+ "continuing_subword_prefix": "##"
34
+ }
35
+ }
36
+ },
37
+ "v16000_mf2": {
38
+ "out_dir": "results\\bpe\\v16000_mf2",
39
+ "metrics": {
40
+ "oov_rate": 0.0,
41
+ "avg_sequence_length": 84.86375,
42
+ "avg_processing_time_ms": 0.1791509985923767,
43
+ "compression_ratio": 84.86375,
44
+ "total_tokens_evaluated": 1697275,
45
+ "unk_count": 0,
46
+ "train_time_s": 122.03794264793396,
47
+ "config": {
48
+ "vocab_size": 16000,
49
+ "min_frequency": 2,
50
+ "continuing_subword_prefix": "##"
51
+ }
52
+ }
53
+ },
54
+ "v16000_mf5": {
55
+ "out_dir": "results\\bpe\\v16000_mf5",
56
+ "metrics": {
57
+ "oov_rate": 0.0,
58
+ "avg_sequence_length": 84.86375,
59
+ "avg_processing_time_ms": 0.1843635559082031,
60
+ "compression_ratio": 84.86375,
61
+ "total_tokens_evaluated": 1697275,
62
+ "unk_count": 0,
63
+ "train_time_s": 119.14113140106201,
64
+ "config": {
65
+ "vocab_size": 16000,
66
+ "min_frequency": 5,
67
+ "continuing_subword_prefix": "##"
68
+ }
69
+ }
70
+ },
71
+ "v32000_mf2": {
72
+ "out_dir": "results\\bpe\\v32000_mf2",
73
+ "metrics": {
74
+ "oov_rate": 0.0,
75
+ "avg_sequence_length": 77.17065,
76
+ "avg_processing_time_ms": 0.18579285144805907,
77
+ "compression_ratio": 77.17065,
78
+ "total_tokens_evaluated": 1543413,
79
+ "unk_count": 0,
80
+ "train_time_s": 122.94540190696716,
81
+ "config": {
82
+ "vocab_size": 32000,
83
+ "min_frequency": 2,
84
+ "continuing_subword_prefix": "##"
85
+ }
86
+ }
87
+ },
88
+ "v32000_mf5": {
89
+ "out_dir": "results\\bpe\\v32000_mf5",
90
+ "metrics": {
91
+ "oov_rate": 0.0,
92
+ "avg_sequence_length": 77.17065,
93
+ "avg_processing_time_ms": 0.1811486840248108,
94
+ "compression_ratio": 77.17065,
95
+ "total_tokens_evaluated": 1543413,
96
+ "unk_count": 0,
97
+ "train_time_s": 122.62627506256104,
98
+ "config": {
99
+ "vocab_size": 32000,
100
+ "min_frequency": 5,
101
+ "continuing_subword_prefix": "##"
102
+ }
103
+ }
104
+ }
105
+ },
106
+ "wordpiece": {
107
+ "v8000_mf1": {
108
+ "out_dir": "results\\wordpiece\\v8000_mf1",
109
+ "metrics": {
110
+ "oov_rate": 0.0,
111
+ "avg_sequence_length": 95.39795,
112
+ "avg_processing_time_ms": 31.364226222038273,
113
+ "compression_ratio": 95.39795,
114
+ "total_tokens_evaluated": 1907959,
115
+ "unk_count": 0,
116
+ "train_time_s": 124.3489019870758,
117
+ "config": {
118
+ "vocab_size": 8000,
119
+ "min_frequency": 1
120
+ }
121
+ }
122
+ },
123
+ "v8000_mf2": {
124
+ "out_dir": "results\\wordpiece\\v8000_mf2",
125
+ "metrics": {
126
+ "oov_rate": 0.0,
127
+ "avg_sequence_length": 95.39795,
128
+ "avg_processing_time_ms": 0.22379395961761475,
129
+ "compression_ratio": 95.39795,
130
+ "total_tokens_evaluated": 1907959,
131
+ "unk_count": 0,
132
+ "train_time_s": 176.4660017490387,
133
+ "config": {
134
+ "vocab_size": 8000,
135
+ "min_frequency": 2
136
+ }
137
+ }
138
+ },
139
+ "v16000_mf1": {
140
+ "out_dir": "results\\wordpiece\\v16000_mf1",
141
+ "metrics": {
142
+ "oov_rate": 0.0,
143
+ "avg_sequence_length": 84.55695,
144
+ "avg_processing_time_ms": 0.2237707018852234,
145
+ "compression_ratio": 84.55695,
146
+ "total_tokens_evaluated": 1691139,
147
+ "unk_count": 0,
148
+ "train_time_s": 184.54623937606812,
149
+ "config": {
150
+ "vocab_size": 16000,
151
+ "min_frequency": 1
152
+ }
153
+ }
154
+ },
155
+ "v16000_mf2": {
156
+ "out_dir": "results\\wordpiece\\v16000_mf2",
157
+ "metrics": {
158
+ "oov_rate": 0.0,
159
+ "avg_sequence_length": 84.55695,
160
+ "avg_processing_time_ms": 0.2417303204536438,
161
+ "compression_ratio": 84.55695,
162
+ "total_tokens_evaluated": 1691139,
163
+ "unk_count": 0,
164
+ "train_time_s": 318.9338138103485,
165
+ "config": {
166
+ "vocab_size": 16000,
167
+ "min_frequency": 2
168
+ }
169
+ }
170
+ },
171
+ "v32000_mf1": {
172
+ "out_dir": "results\\wordpiece\\v32000_mf1",
173
+ "metrics": {
174
+ "oov_rate": 0.0,
175
+ "avg_sequence_length": 76.92375,
176
+ "avg_processing_time_ms": 0.2857889056205749,
177
+ "compression_ratio": 76.92375,
178
+ "total_tokens_evaluated": 1538475,
179
+ "unk_count": 0,
180
+ "train_time_s": 158.26075053215027,
181
+ "config": {
182
+ "vocab_size": 32000,
183
+ "min_frequency": 1
184
+ }
185
+ }
186
+ },
187
+ "v32000_mf2": {
188
+ "out_dir": "results\\wordpiece\\v32000_mf2",
189
+ "metrics": {
190
+ "oov_rate": 0.0,
191
+ "avg_sequence_length": 76.92375,
192
+ "avg_processing_time_ms": 0.518797504901886,
193
+ "compression_ratio": 76.92375,
194
+ "total_tokens_evaluated": 1538475,
195
+ "unk_count": 0,
196
+ "train_time_s": 157.1074833869934,
197
+ "config": {
198
+ "vocab_size": 32000,
199
+ "min_frequency": 2
200
+ }
201
+ }
202
+ }
203
+ },
204
+ "unigram": {
205
+ "v8000": {
206
+ "out_dir": "results\\unigram\\v8000",
207
+ "metrics": {
208
+ "oov_rate": 0.0,
209
+ "avg_sequence_length": 101.5805,
210
+ "avg_processing_time_ms": 0.3227068305015564,
211
+ "compression_ratio": 101.5805,
212
+ "total_tokens_evaluated": 2031610,
213
+ "unk_count": 0,
214
+ "train_time_s": 601.7949032783508,
215
+ "config": {
216
+ "vocab_size": 8000
217
+ }
218
+ }
219
+ },
220
+ "v16000": {
221
+ "out_dir": "results\\unigram\\v16000",
222
+ "metrics": {
223
+ "oov_rate": 0.0,
224
+ "avg_sequence_length": 90.8909,
225
+ "avg_processing_time_ms": 0.29166127443313594,
226
+ "compression_ratio": 90.8909,
227
+ "total_tokens_evaluated": 1817818,
228
+ "unk_count": 0,
229
+ "train_time_s": 614.1360929012299,
230
+ "config": {
231
+ "vocab_size": 16000
232
+ }
233
+ }
234
+ },
235
+ "v32000": {
236
+ "out_dir": "results\\unigram\\v32000",
237
+ "metrics": {
238
+ "oov_rate": 0.0,
239
+ "avg_sequence_length": 83.3668,
240
+ "avg_processing_time_ms": 0.32854799032211307,
241
+ "compression_ratio": 83.3668,
242
+ "total_tokens_evaluated": 1667336,
243
+ "unk_count": 0,
244
+ "train_time_s": 757.2155563831329,
245
+ "config": {
246
+ "vocab_size": 32000
247
+ }
248
+ }
249
+ }
250
+ },
251
+ "spm": {
252
+ "v8000": {
253
+ "out_dir": "results\\spm_unigram\\v8000",
254
+ "metrics": {
255
+ "oov_rate": 0.0,
256
+ "avg_sequence_length": 107.90535,
257
+ "avg_processing_time_ms": 0.11760829687118529,
258
+ "compression_ratio": 107.90535,
259
+ "total_tokens_evaluated": 2158107,
260
+ "unk_count": 0,
261
+ "unk_piece_used": "[UNK]",
262
+ "train_time_s": 343.80153012275696,
263
+ "config": {
264
+ "vocab_size": 8000
265
+ }
266
+ }
267
+ },
268
+ "v16000": {
269
+ "out_dir": "results\\spm_unigram\\v16000",
270
+ "metrics": {
271
+ "oov_rate": 0.0,
272
+ "avg_sequence_length": 95.67175,
273
+ "avg_processing_time_ms": 0.160364830493927,
274
+ "compression_ratio": 95.67175,
275
+ "total_tokens_evaluated": 1913435,
276
+ "unk_count": 0,
277
+ "unk_piece_used": "[UNK]",
278
+ "train_time_s": 477.8609836101532,
279
+ "config": {
280
+ "vocab_size": 16000
281
+ }
282
+ }
283
+ },
284
+ "v32000": {
285
+ "out_dir": "results\\spm_unigram\\v32000",
286
+ "metrics": {
287
+ "oov_rate": 0.0,
288
+ "avg_sequence_length": 86.6945,
289
+ "avg_processing_time_ms": 0.1026016116142273,
290
+ "compression_ratio": 86.6945,
291
+ "total_tokens_evaluated": 1733890,
292
+ "unk_count": 0,
293
+ "unk_piece_used": "[UNK]",
294
+ "train_time_s": 249.83488726615906,
295
+ "config": {
296
+ "vocab_size": 32000
297
+ }
298
+ }
299
+ }
300
+ },
301
+ "metadata": {
302
+ "corpus_path": "full_tatar_raw_corpus_clean.txt",
303
+ "vocab_sizes": [
304
+ 8000,
305
+ 16000,
306
+ 32000
307
+ ],
308
+ "sample_size": 20000,
309
+ "seed": 42,
310
+ "selected_models": [
311
+ "bpe",
312
+ "wordpiece",
313
+ "unigram",
314
+ "spm"
315
+ ],
316
+ "timestamp": "2025-11-19 21:10:06"
317
+ }
318
+ }