Removed tokenizers
Browse files
- fasttext/cbow100/ft_cbow_100.model +0 -3
- fasttext/cbow100/ft_cbow_100.model.syn1neg.npy +0 -3
- fasttext/cbow100/ft_cbow_100.model.wv.vectors_ngrams.npy +0 -3
- fasttext/cbow100/ft_cbow_100.model.wv.vectors_vocab.npy +0 -3
- fasttext/cbow200/ft_cbow_200.model +0 -3
- fasttext/cbow200/ft_cbow_200.model.syn1neg.npy +0 -3
- fasttext/cbow200/ft_cbow_200.model.wv.vectors_ngrams.npy +0 -3
- fasttext/cbow200/ft_cbow_200.model.wv.vectors_vocab.npy +0 -3
- test_results/comparison_en.png +0 -3
- test_results/comparison_ru.png +0 -3
- test_results/correlation_en.png +0 -3
- test_results/correlation_ru.png +0 -3
- test_results/test_summary_20260310_202532.txt +0 -17
- test_results/token_length_dist_en.png +0 -3
- test_results/token_length_dist_ru.png +0 -3
- test_results/tokenizer_test_report.csv +0 -11
- test_results/top10_score_en.png +0 -0
- test_results/top10_score_ru.png +0 -0
- tokenizers/bbpe/25k/bbpe_25k.json +0 -0
- tokenizers/bbpe/50k/bbpe_50k.json +0 -0
- tokenizers/bbpe_fixed/50k/bbpe_fixed_50k.json +0 -0
- tokenizers/bpe/50k/bpe_50k.json +0 -0
- tokenizers/bpe/50k_freq5/bpe_50k_freq5.json +0 -0
- tokenizers/bpe_fixed/50k/bpe_fixed_50k.json +0 -0
- tokenizers/unigram/25k/uni_25k.json +0 -0
- tokenizers/unigram/50k/uni_50k.json +0 -0
- tokenizers/wordpiece/25k/wp_25k.json +0 -0
- tokenizers/wordpiece/50k/wp_50k.json +0 -0
- word2vec/cbow100/w2v_cbow_100.model +0 -3
- word2vec/cbow100/w2v_cbow_100.model.syn1neg.npy +0 -3
- word2vec/cbow100/w2v_cbow_100.model.wv.vectors.npy +0 -3
- word2vec/cbow200/w2v_cbow_200.model +0 -3
- word2vec/cbow200/w2v_cbow_200.model.syn1neg.npy +0 -3
- word2vec/cbow200/w2v_cbow_200.model.wv.vectors.npy +0 -3
- word2vec/sg100/w2v_sg_100.model.syn1neg.npy +0 -3
- word2vec/sg100/w2v_sg_100.model.wv.vectors.npy +0 -3
fasttext/cbow100/ft_cbow_100.model
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:4d45a01420c254e66221425256da8aa42d1cffee9af3475535b7db0a426ae14f
|
| 3 |
-
size 54568486
|
|
|
|
|
|
|
|
|
|
|
|
fasttext/cbow100/ft_cbow_100.model.syn1neg.npy
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:79eea0361a28ea1303645c3185fce538af71f51dadce9b5d45f3dc6f761df388
|
| 3 |
-
size 517596928
|
|
|
|
|
|
|
|
|
|
|
|
fasttext/cbow100/ft_cbow_100.model.wv.vectors_ngrams.npy
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:a69298272c0ef4a39ed8ffcdce80ec6b3c7a37a216e300a152d999245551680c
|
| 3 |
-
size 800000128
|
|
|
|
|
|
|
|
|
|
|
|
fasttext/cbow100/ft_cbow_100.model.wv.vectors_vocab.npy
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:cba3542af85af6e3fe2ea61650618719516ef35d4d08ee9800f6ea05503861a3
|
| 3 |
-
size 517596928
|
|
|
|
|
|
|
|
|
|
|
|
fasttext/cbow200/ft_cbow_200.model
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:24cc8a81946a6ceae195073da91ef6fe73035c8208875809a4f1712d531dd38c
|
| 3 |
-
size 54568486
|
|
|
|
|
|
|
|
|
|
|
|
fasttext/cbow200/ft_cbow_200.model.syn1neg.npy
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:9e901475e8da553f113c3cbd27cc168fd64056267b1849d55f3899272fdc1091
|
| 3 |
-
size 1035193728
|
|
|
|
|
|
|
|
|
|
|
|
fasttext/cbow200/ft_cbow_200.model.wv.vectors_ngrams.npy
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:71e20e73b7556ca923fbae63f4f8585d20b48f4c9505bdbaf213c0abdd0d08f3
|
| 3 |
-
size 1600000128
|
|
|
|
|
|
|
|
|
|
|
|
fasttext/cbow200/ft_cbow_200.model.wv.vectors_vocab.npy
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:d92479d52f020ffbbbd53d4b7a26036bec3ff3cc35a6487df9253ee0fad419ec
|
| 3 |
-
size 1035193728
|
|
|
|
|
|
|
|
|
|
|
|
test_results/comparison_en.png
DELETED
Git LFS Details
|
test_results/comparison_ru.png
DELETED
Git LFS Details
|
test_results/correlation_en.png
DELETED
Git LFS Details
|
test_results/correlation_ru.png
DELETED
Git LFS Details
|
test_results/test_summary_20260310_202532.txt
DELETED
|
@@ -1,17 +0,0 @@
|
|
| 1 |
-
============================================================
|
| 2 |
-
ИТОГОВЫЙ ОТЧЁТ ПО ТЕСТИРОВАНИЮ ТОКЕНИЗАТОРОВ
|
| 3 |
-
Дата: 2026-03-10 20:25:32.743689
|
| 4 |
-
Тестовый корпус: 10000 документов, 19479650 символов
|
| 5 |
-
============================================================
|
| 6 |
-
|
| 7 |
-
ЛУЧШИЕ МОДЕЛИ ПО КОМПОЗИТНОЙ ОЦЕНКЕ:
|
| 8 |
-
4. bbpe_fixed_50k (BPE_fixed)
|
| 9 |
-
unknown_rate=0.0000, compression=5.17, word_coverage=1.0000, speed=315922
|
| 10 |
-
2. wp_50k (WordPiece)
|
| 11 |
-
unknown_rate=0.0000, compression=4.67, word_coverage=1.0000, speed=378751
|
| 12 |
-
3. bpe_50k (BPE)
|
| 13 |
-
unknown_rate=0.0000, compression=4.60, word_coverage=1.0000, speed=247421
|
| 14 |
-
7. bpe_50k_freq5 (BPE)
|
| 15 |
-
unknown_rate=0.0000, compression=4.60, word_coverage=1.0000, speed=226591
|
| 16 |
-
5. bbpe_50k (BBPE)
|
| 17 |
-
unknown_rate=0.0000, compression=4.60, word_coverage=1.0000, speed=227322
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
test_results/token_length_dist_en.png
DELETED
Git LFS Details
|
test_results/token_length_dist_ru.png
DELETED
Git LFS Details
|
test_results/tokenizer_test_report.csv
DELETED
|
@@ -1,11 +0,0 @@
|
|
| 1 |
-
name,type,vocab_size,time_100k_chars,tokens_per_second_100k,model_size_mb,unknown_rate,avg_tokens_per_doc,avg_token_length,median_token_length,max_token_length,word_coverage,compression_ratio,unique_tokens_used,vocabulary_usage,top_tokens,top_token_freq,rare_token_ratio
|
| 2 |
-
bbpe_25k,BBPE,25000,0.08356547355651855,257103.79042450906,2.4044103622436523,0.0,748.822,7.796199897973083,7.0,61,1.0,4.279625865693048,19990,0.7996,"['.', ',', 'ĠâĢĶ', '-', ':', 'Ġда', 'ĠбелÓĻн', 'ĠдÓĻ', '!', 'Ġ«']","[20580, 19456, 4561, 4077, 2813, 2728, 2485, 2285, 1864, 1864]",0.15887943971985993
|
| 3 |
-
wp_50k,WordPiece,50000,0.05166983604431152,378750.95990660717,1.3837089538574219,0.0,685.8,4.344056576261301,4.0,30,1.0,4.672904636920385,30188,0.60376,"['.', ',', '—', '-', ':', 'да', 'белән', 'дә', '«', '!']","[20885, 19721, 4748, 4432, 2972, 2722, 2468, 2281, 1975, 1947]",0.2781237577845502
|
| 4 |
-
bpe_50k,BPE,50000,0.08070063591003418,247420.60300812745,5.065661430358887,0.0,696.054,8.388668120576853,8.0,61,1.0,4.604065201837789,30107,0.60214,"['.', ',', 'ĠâĢĶ', '-', ':', 'Ġда', 'ĠбелÓĻн', 'ĠдÓĻ', 'Ġ«', '!']","[20578, 19453, 4556, 4054, 2812, 2711, 2469, 2284, 1874, 1862]",0.2897664994851696
|
| 5 |
-
bbpe_fixed_50k,BPE_fixed,50000,0.0562291145324707,315921.74530405947,3.993025779724121,0.0,619.666,5.173235258994361,5.0,32,1.0,5.171621486413649,32068,0.64136,"['▁—', '▁белән', '▁да', '▁дә', '▁һәм', '▁иде.', '▁бер', '▁ул', '▁дип', '▁–']","[4356, 2379, 2140, 1862, 1535, 1116, 1091, 1090, 1000, 908]",0.2763502557066234
|
| 6 |
-
bbpe_50k,BBPE,50000,0.08784890174865723,227322.13610519317,5.047884941101074,0.0,696.454,8.38241434466598,8.0,61,1.0,4.601420912221052,30049,0.60098,"['.', ',', 'ĠâĢĶ', '-', ':', 'Ġда', 'ĠбелÓĻн', 'ĠдÓĻ', 'Ġ«', '!']","[20578, 19453, 4556, 4054, 2812, 2711, 2469, 2284, 1863, 1862]",0.2870311824020766
|
| 7 |
-
bpe_fixed_50k,BPE_fixed,50000,0.057566165924072266,337246.7088672603,3.8777475357055664,0.0014830942091103511,674.266,4.092767542779852,4.0,30,0.6354397643096147,4.752839383863342,28498,0.56996,"['.', ',', '—', '-', 'да', ':', 'белән', 'дә', '«', '!']","[20885, 19724, 4750, 4432, 2998, 2972, 2618, 2483, 1990, 1948]",0.29777528247596324
|
| 8 |
-
bpe_50k_freq5,BPE,50000,0.0881190299987793,226591.23687898743,5.065661430358887,0.0,696.054,8.388668120576853,8.0,61,1.0,4.604065201837789,30107,0.60214,"['.', ',', 'ĠâĢĶ', '-', ':', 'Ġда', 'ĠбелÓĻн', 'ĠдÓĻ', 'Ġ«', '!']","[20578, 19453, 4556, 4054, 2812, 2711, 2469, 2284, 1874, 1862]",0.2897664994851696
|
| 9 |
-
wp_25k,WordPiece,25000,0.042369842529296875,496272.7908526155,0.6622514724731445,0.0,735.806,4.184752502697722,4.0,30,1.0,4.3553300734160905,19895,0.7958,"['.', ',', '—', '-', ':', 'да', 'белән', 'дә', '«', '!']","[20887, 19722, 4753, 4437, 2972, 2739, 2486, 2288, 1976, 1949]",0.1456144759989947
|
| 10 |
-
uni_50k,Unigram,50000,0.10636377334594727,189622.8327139213,3.413966178894043,0.0,697.688,4.594715689534577,4.0,16,1.0,4.593282384102923,29572,0.59144,"['.', ',', '▁', '▁—', '▁да', '▁белән', '▁дә', ':', '!', '▁иде']","[19939, 19165, 8524, 4617, 2781, 2483, 2329, 1825, 1754, 1658]",0.29061274178276747
|
| 11 |
-
uni_25k,Unigram,25000,0.08225703239440918,260403.26737408372,1.6880054473876953,0.0,745.354,4.300879850379819,4.0,16,1.0,4.29953820600681,19610,0.7844,"['.', ',', '▁', '▁—', '▁да', '▁белән', '▁дә', ':', '!', '▁иде']","[19521, 18909, 8112, 4635, 2795, 2488, 2352, 1834, 1833, 1671]",0.16471188169301376
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
test_results/top10_score_en.png
DELETED
|
Binary file (39 kB)
|
|
|
test_results/top10_score_ru.png
DELETED
|
Binary file (39.4 kB)
|
|
|
tokenizers/bbpe/25k/bbpe_25k.json
DELETED
|
The diff for this file is too large to render.
See raw diff
|
|
|
tokenizers/bbpe/50k/bbpe_50k.json
DELETED
|
The diff for this file is too large to render.
See raw diff
|
|
|
tokenizers/bbpe_fixed/50k/bbpe_fixed_50k.json
DELETED
|
The diff for this file is too large to render.
See raw diff
|
|
|
tokenizers/bpe/50k/bpe_50k.json
DELETED
|
The diff for this file is too large to render.
See raw diff
|
|
|
tokenizers/bpe/50k_freq5/bpe_50k_freq5.json
DELETED
|
The diff for this file is too large to render.
See raw diff
|
|
|
tokenizers/bpe_fixed/50k/bpe_fixed_50k.json
DELETED
|
The diff for this file is too large to render.
See raw diff
|
|
|
tokenizers/unigram/25k/uni_25k.json
DELETED
|
The diff for this file is too large to render.
See raw diff
|
|
|
tokenizers/unigram/50k/uni_50k.json
DELETED
|
The diff for this file is too large to render.
See raw diff
|
|
|
tokenizers/wordpiece/25k/wp_25k.json
DELETED
|
The diff for this file is too large to render.
See raw diff
|
|
|
tokenizers/wordpiece/50k/wp_50k.json
DELETED
|
The diff for this file is too large to render.
See raw diff
|
|
|
word2vec/cbow100/w2v_cbow_100.model
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:3bb6db8fe225765005f76ff9f87e3e5dd6bc842badaf33d8088f2cd008075356
|
| 3 |
-
size 54568363
|
|
|
|
|
|
|
|
|
|
|
|
word2vec/cbow100/w2v_cbow_100.model.syn1neg.npy
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:75da397d98f88cabc1baedcb652110763ea1aafc652b2d54bc85b70d397b72e9
|
| 3 |
-
size 517596928
|
|
|
|
|
|
|
|
|
|
|
|
word2vec/cbow100/w2v_cbow_100.model.wv.vectors.npy
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:2df494227c658b2deb338f69b0ba73e48dae87a7e2a66ad1f198064ddd7445ab
|
| 3 |
-
size 517596928
|
|
|
|
|
|
|
|
|
|
|
|
word2vec/cbow200/w2v_cbow_200.model
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:5f79189d0a68eeaabcb08f68b9b452cfd77bb6e62c757569eb837c2f93056660
|
| 3 |
-
size 54568363
|
|
|
|
|
|
|
|
|
|
|
|
word2vec/cbow200/w2v_cbow_200.model.syn1neg.npy
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:d2b1ab3bce9ca229219a97c1cb299651b0277c746c2940604606f84f384806f9
|
| 3 |
-
size 1035193728
|
|
|
|
|
|
|
|
|
|
|
|
word2vec/cbow200/w2v_cbow_200.model.wv.vectors.npy
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:9d05832cd6efc2bd7c4ca203c9af672ed9929f4f2c4211952dab1974d52a9b31
|
| 3 |
-
size 1035193728
|
|
|
|
|
|
|
|
|
|
|
|
word2vec/sg100/w2v_sg_100.model.syn1neg.npy
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:ff76e398e5a015f25f18505fe9249feb917d9574d3dfa180aef19cc21fc2b1e0
|
| 3 |
-
size 517596928
|
|
|
|
|
|
|
|
|
|
|
|
word2vec/sg100/w2v_sg_100.model.wv.vectors.npy
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:e3789bf1978ceae2bc6d3acac17e0cf51c6e40bf3e287c67e8a5e754b58620ef
|
| 3 |
-
size 517596928
|
|
|
|
|
|
|
|
|
|
|
|