Tatar
ArabovMK committed on
Commit
f7d18a9
·
1 Parent(s): f0f6a8f

Removed tokenizers

Browse files
Files changed (36) hide show
  1. fasttext/cbow100/ft_cbow_100.model +0 -3
  2. fasttext/cbow100/ft_cbow_100.model.syn1neg.npy +0 -3
  3. fasttext/cbow100/ft_cbow_100.model.wv.vectors_ngrams.npy +0 -3
  4. fasttext/cbow100/ft_cbow_100.model.wv.vectors_vocab.npy +0 -3
  5. fasttext/cbow200/ft_cbow_200.model +0 -3
  6. fasttext/cbow200/ft_cbow_200.model.syn1neg.npy +0 -3
  7. fasttext/cbow200/ft_cbow_200.model.wv.vectors_ngrams.npy +0 -3
  8. fasttext/cbow200/ft_cbow_200.model.wv.vectors_vocab.npy +0 -3
  9. test_results/comparison_en.png +0 -3
  10. test_results/comparison_ru.png +0 -3
  11. test_results/correlation_en.png +0 -3
  12. test_results/correlation_ru.png +0 -3
  13. test_results/test_summary_20260310_202532.txt +0 -17
  14. test_results/token_length_dist_en.png +0 -3
  15. test_results/token_length_dist_ru.png +0 -3
  16. test_results/tokenizer_test_report.csv +0 -11
  17. test_results/top10_score_en.png +0 -0
  18. test_results/top10_score_ru.png +0 -0
  19. tokenizers/bbpe/25k/bbpe_25k.json +0 -0
  20. tokenizers/bbpe/50k/bbpe_50k.json +0 -0
  21. tokenizers/bbpe_fixed/50k/bbpe_fixed_50k.json +0 -0
  22. tokenizers/bpe/50k/bpe_50k.json +0 -0
  23. tokenizers/bpe/50k_freq5/bpe_50k_freq5.json +0 -0
  24. tokenizers/bpe_fixed/50k/bpe_fixed_50k.json +0 -0
  25. tokenizers/unigram/25k/uni_25k.json +0 -0
  26. tokenizers/unigram/50k/uni_50k.json +0 -0
  27. tokenizers/wordpiece/25k/wp_25k.json +0 -0
  28. tokenizers/wordpiece/50k/wp_50k.json +0 -0
  29. word2vec/cbow100/w2v_cbow_100.model +0 -3
  30. word2vec/cbow100/w2v_cbow_100.model.syn1neg.npy +0 -3
  31. word2vec/cbow100/w2v_cbow_100.model.wv.vectors.npy +0 -3
  32. word2vec/cbow200/w2v_cbow_200.model +0 -3
  33. word2vec/cbow200/w2v_cbow_200.model.syn1neg.npy +0 -3
  34. word2vec/cbow200/w2v_cbow_200.model.wv.vectors.npy +0 -3
  35. word2vec/sg100/w2v_sg_100.model.syn1neg.npy +0 -3
  36. word2vec/sg100/w2v_sg_100.model.wv.vectors.npy +0 -3
fasttext/cbow100/ft_cbow_100.model DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:4d45a01420c254e66221425256da8aa42d1cffee9af3475535b7db0a426ae14f
3
- size 54568486
 
 
 
 
fasttext/cbow100/ft_cbow_100.model.syn1neg.npy DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:79eea0361a28ea1303645c3185fce538af71f51dadce9b5d45f3dc6f761df388
3
- size 517596928
 
 
 
 
fasttext/cbow100/ft_cbow_100.model.wv.vectors_ngrams.npy DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:a69298272c0ef4a39ed8ffcdce80ec6b3c7a37a216e300a152d999245551680c
3
- size 800000128
 
 
 
 
fasttext/cbow100/ft_cbow_100.model.wv.vectors_vocab.npy DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:cba3542af85af6e3fe2ea61650618719516ef35d4d08ee9800f6ea05503861a3
3
- size 517596928
 
 
 
 
fasttext/cbow200/ft_cbow_200.model DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:24cc8a81946a6ceae195073da91ef6fe73035c8208875809a4f1712d531dd38c
3
- size 54568486
 
 
 
 
fasttext/cbow200/ft_cbow_200.model.syn1neg.npy DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:9e901475e8da553f113c3cbd27cc168fd64056267b1849d55f3899272fdc1091
3
- size 1035193728
 
 
 
 
fasttext/cbow200/ft_cbow_200.model.wv.vectors_ngrams.npy DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:71e20e73b7556ca923fbae63f4f8585d20b48f4c9505bdbaf213c0abdd0d08f3
3
- size 1600000128
 
 
 
 
fasttext/cbow200/ft_cbow_200.model.wv.vectors_vocab.npy DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:d92479d52f020ffbbbd53d4b7a26036bec3ff3cc35a6487df9253ee0fad419ec
3
- size 1035193728
 
 
 
 
test_results/comparison_en.png DELETED

Git LFS Details

  • SHA256: d6a3a315dcf3e07ac62b3de79cb5811ef9fa5948ba4e4ca0dee249545a98e586
  • Pointer size: 131 Bytes
  • Size of remote file: 142 kB
test_results/comparison_ru.png DELETED

Git LFS Details

  • SHA256: 641a5a2bee0fb61415aad40a2474474a3f6a62436f45912719977930fee84615
  • Pointer size: 131 Bytes
  • Size of remote file: 145 kB
test_results/correlation_en.png DELETED

Git LFS Details

  • SHA256: cbd40939608f69b91f4d87a42b55923b62b13a2f534c9474c49b5219c47ea317
  • Pointer size: 131 Bytes
  • Size of remote file: 136 kB
test_results/correlation_ru.png DELETED

Git LFS Details

  • SHA256: 007c669a03d14e6474d1bed0402999456b6c4774f6cf89655094ec87044fa875
  • Pointer size: 131 Bytes
  • Size of remote file: 136 kB
test_results/test_summary_20260310_202532.txt DELETED
@@ -1,17 +0,0 @@
1
- ============================================================
2
- ИТОГОВЫЙ ОТЧЁТ ПО ТЕСТИРОВАНИЮ ТОКЕНИЗАТОРОВ
3
- Дата: 2026-03-10 20:25:32.743689
4
- Тестовый корпус: 10000 документов, 19479650 символов
5
- ============================================================
6
-
7
- ЛУЧШИЕ МОДЕЛИ ПО КОМПОЗИТНОЙ ОЦЕНКЕ:
8
- 4. bbpe_fixed_50k (BPE_fixed)
9
- unknown_rate=0.0000, compression=5.17, word_coverage=1.0000, speed=315922
10
- 2. wp_50k (WordPiece)
11
- unknown_rate=0.0000, compression=4.67, word_coverage=1.0000, speed=378751
12
- 3. bpe_50k (BPE)
13
- unknown_rate=0.0000, compression=4.60, word_coverage=1.0000, speed=247421
14
- 7. bpe_50k_freq5 (BPE)
15
- unknown_rate=0.0000, compression=4.60, word_coverage=1.0000, speed=226591
16
- 5. bbpe_50k (BBPE)
17
- unknown_rate=0.0000, compression=4.60, word_coverage=1.0000, speed=227322
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
test_results/token_length_dist_en.png DELETED

Git LFS Details

  • SHA256: 9de600ad2e7683ec482f39f92a0cd38505264c824c45c0d17296b0270be5337d
  • Pointer size: 131 Bytes
  • Size of remote file: 128 kB
test_results/token_length_dist_ru.png DELETED

Git LFS Details

  • SHA256: 298e70b0f9a7ae2bd07f7198266f8976f128e881c33d5d8279937447d82ab1dd
  • Pointer size: 131 Bytes
  • Size of remote file: 125 kB
test_results/tokenizer_test_report.csv DELETED
@@ -1,11 +0,0 @@
1
- name,type,vocab_size,time_100k_chars,tokens_per_second_100k,model_size_mb,unknown_rate,avg_tokens_per_doc,avg_token_length,median_token_length,max_token_length,word_coverage,compression_ratio,unique_tokens_used,vocabulary_usage,top_tokens,top_token_freq,rare_token_ratio
2
- bbpe_25k,BBPE,25000,0.08356547355651855,257103.79042450906,2.4044103622436523,0.0,748.822,7.796199897973083,7.0,61,1.0,4.279625865693048,19990,0.7996,"['.', ',', 'ĠâĢĶ', '-', ':', 'Ġда', 'ĠбелÓĻн', 'ĠдÓĻ', '!', 'Ġ«']","[20580, 19456, 4561, 4077, 2813, 2728, 2485, 2285, 1864, 1864]",0.15887943971985993
3
- wp_50k,WordPiece,50000,0.05166983604431152,378750.95990660717,1.3837089538574219,0.0,685.8,4.344056576261301,4.0,30,1.0,4.672904636920385,30188,0.60376,"['.', ',', '—', '-', ':', 'да', 'белән', 'дә', '«', '!']","[20885, 19721, 4748, 4432, 2972, 2722, 2468, 2281, 1975, 1947]",0.2781237577845502
4
- bpe_50k,BPE,50000,0.08070063591003418,247420.60300812745,5.065661430358887,0.0,696.054,8.388668120576853,8.0,61,1.0,4.604065201837789,30107,0.60214,"['.', ',', 'ĠâĢĶ', '-', ':', 'Ġда', 'ĠбелÓĻн', 'ĠдÓĻ', 'Ġ«', '!']","[20578, 19453, 4556, 4054, 2812, 2711, 2469, 2284, 1874, 1862]",0.2897664994851696
5
- bbpe_fixed_50k,BPE_fixed,50000,0.0562291145324707,315921.74530405947,3.993025779724121,0.0,619.666,5.173235258994361,5.0,32,1.0,5.171621486413649,32068,0.64136,"['▁—', '▁белән', '▁да', '▁дә', '▁һәм', '▁иде.', '▁бер', '▁ул', '▁дип', '▁–']","[4356, 2379, 2140, 1862, 1535, 1116, 1091, 1090, 1000, 908]",0.2763502557066234
6
- bbpe_50k,BBPE,50000,0.08784890174865723,227322.13610519317,5.047884941101074,0.0,696.454,8.38241434466598,8.0,61,1.0,4.601420912221052,30049,0.60098,"['.', ',', 'ĠâĢĶ', '-', ':', 'Ġда', 'ĠбелÓĻн', 'ĠдÓĻ', 'Ġ«', '!']","[20578, 19453, 4556, 4054, 2812, 2711, 2469, 2284, 1863, 1862]",0.2870311824020766
7
- bpe_fixed_50k,BPE_fixed,50000,0.057566165924072266,337246.7088672603,3.8777475357055664,0.0014830942091103511,674.266,4.092767542779852,4.0,30,0.6354397643096147,4.752839383863342,28498,0.56996,"['.', ',', '—', '-', 'да', ':', 'белән', 'дә', '«', '!']","[20885, 19724, 4750, 4432, 2998, 2972, 2618, 2483, 1990, 1948]",0.29777528247596324
8
- bpe_50k_freq5,BPE,50000,0.0881190299987793,226591.23687898743,5.065661430358887,0.0,696.054,8.388668120576853,8.0,61,1.0,4.604065201837789,30107,0.60214,"['.', ',', 'ĠâĢĶ', '-', ':', 'Ġда', 'ĠбелÓĻн', 'ĠдÓĻ', 'Ġ«', '!']","[20578, 19453, 4556, 4054, 2812, 2711, 2469, 2284, 1874, 1862]",0.2897664994851696
9
- wp_25k,WordPiece,25000,0.042369842529296875,496272.7908526155,0.6622514724731445,0.0,735.806,4.184752502697722,4.0,30,1.0,4.3553300734160905,19895,0.7958,"['.', ',', '—', '-', ':', 'да', 'белән', 'дә', '«', '!']","[20887, 19722, 4753, 4437, 2972, 2739, 2486, 2288, 1976, 1949]",0.1456144759989947
10
- uni_50k,Unigram,50000,0.10636377334594727,189622.8327139213,3.413966178894043,0.0,697.688,4.594715689534577,4.0,16,1.0,4.593282384102923,29572,0.59144,"['.', ',', '▁', '▁—', '▁да', '▁белән', '▁дә', ':', '!', '▁иде']","[19939, 19165, 8524, 4617, 2781, 2483, 2329, 1825, 1754, 1658]",0.29061274178276747
11
- uni_25k,Unigram,25000,0.08225703239440918,260403.26737408372,1.6880054473876953,0.0,745.354,4.300879850379819,4.0,16,1.0,4.29953820600681,19610,0.7844,"['.', ',', '▁', '▁—', '▁да', '▁белән', '▁дә', ':', '!', '▁иде']","[19521, 18909, 8112, 4635, 2795, 2488, 2352, 1834, 1833, 1671]",0.16471188169301376
 
 
 
 
 
 
 
 
 
 
 
 
test_results/top10_score_en.png DELETED
Binary file (39 kB)
 
test_results/top10_score_ru.png DELETED
Binary file (39.4 kB)
 
tokenizers/bbpe/25k/bbpe_25k.json DELETED
The diff for this file is too large to render. See raw diff
 
tokenizers/bbpe/50k/bbpe_50k.json DELETED
The diff for this file is too large to render. See raw diff
 
tokenizers/bbpe_fixed/50k/bbpe_fixed_50k.json DELETED
The diff for this file is too large to render. See raw diff
 
tokenizers/bpe/50k/bpe_50k.json DELETED
The diff for this file is too large to render. See raw diff
 
tokenizers/bpe/50k_freq5/bpe_50k_freq5.json DELETED
The diff for this file is too large to render. See raw diff
 
tokenizers/bpe_fixed/50k/bpe_fixed_50k.json DELETED
The diff for this file is too large to render. See raw diff
 
tokenizers/unigram/25k/uni_25k.json DELETED
The diff for this file is too large to render. See raw diff
 
tokenizers/unigram/50k/uni_50k.json DELETED
The diff for this file is too large to render. See raw diff
 
tokenizers/wordpiece/25k/wp_25k.json DELETED
The diff for this file is too large to render. See raw diff
 
tokenizers/wordpiece/50k/wp_50k.json DELETED
The diff for this file is too large to render. See raw diff
 
word2vec/cbow100/w2v_cbow_100.model DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:3bb6db8fe225765005f76ff9f87e3e5dd6bc842badaf33d8088f2cd008075356
3
- size 54568363
 
 
 
 
word2vec/cbow100/w2v_cbow_100.model.syn1neg.npy DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:75da397d98f88cabc1baedcb652110763ea1aafc652b2d54bc85b70d397b72e9
3
- size 517596928
 
 
 
 
word2vec/cbow100/w2v_cbow_100.model.wv.vectors.npy DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:2df494227c658b2deb338f69b0ba73e48dae87a7e2a66ad1f198064ddd7445ab
3
- size 517596928
 
 
 
 
word2vec/cbow200/w2v_cbow_200.model DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:5f79189d0a68eeaabcb08f68b9b452cfd77bb6e62c757569eb837c2f93056660
3
- size 54568363
 
 
 
 
word2vec/cbow200/w2v_cbow_200.model.syn1neg.npy DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:d2b1ab3bce9ca229219a97c1cb299651b0277c746c2940604606f84f384806f9
3
- size 1035193728
 
 
 
 
word2vec/cbow200/w2v_cbow_200.model.wv.vectors.npy DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:9d05832cd6efc2bd7c4ca203c9af672ed9929f4f2c4211952dab1974d52a9b31
3
- size 1035193728
 
 
 
 
word2vec/sg100/w2v_sg_100.model.syn1neg.npy DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:ff76e398e5a015f25f18505fe9249feb917d9574d3dfa180aef19cc21fc2b1e0
3
- size 517596928
 
 
 
 
word2vec/sg100/w2v_sg_100.model.wv.vectors.npy DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:e3789bf1978ceae2bc6d3acac17e0cf51c6e40bf3e287c67e8a5e754b58620ef
3
- size 517596928