5dimension committed on
Commit
3824578
·
verified ·
1 Parent(s): bb85012

🦴 v2.0: 65K text vocab, 30 languages, ~288K samples

Browse files
Files changed (3) hide show
  1. benchmark_results.json +21 -64
  2. sentinel_manifold.json +42 -27
  3. tokenizer.json +0 -0
benchmark_results.json CHANGED
@@ -1,71 +1,28 @@
1
  {
2
- "sentinel_tokenizer": {
3
- "vocab_size": 61440,
4
- "text_vocab": 32768,
5
- "image_codebook": 16384,
6
- "audio_codebook": 8192,
7
- "video_codebook": 4096,
8
- "metrics": {
9
- "avg_fertility": 9.13065205232572,
10
- "std_fertility": 16.348063069521316,
11
- "avg_compression": 3.5456289797801976,
12
- "fairness": 0.057643322830483165
13
- }
14
- },
15
- "comparisons": {
16
- "GPT-2 (50K)": {
17
- "avg_fertility": 20.85785254531753,
18
- "std_fertility": 40.76486672709434,
19
- "avg_compression": 2.4054180948259107,
20
- "fairness": 0.023943569760064974
21
  },
22
- "Gemma (256K)": {
23
- "avg_fertility": 6.688784516655667,
24
- "std_fertility": 11.713991856851852,
25
- "avg_compression": 4.660773272747129,
26
- "fairness": 0.07865350326310598
27
  },
28
- "Qwen2 (151K)": {
29
- "avg_fertility": 8.030528860080679,
30
- "std_fertility": 13.75415784885323,
31
- "avg_compression": 3.8169528301673328,
32
- "fairness": 0.06777750450038225
33
  },
34
- "Sentinel-SUT": {
35
- "avg_fertility": 9.13065205232572,
36
- "std_fertility": 16.348063069521316,
37
- "avg_compression": 3.5456289797801976,
38
- "fairness": 0.057643322830483165
39
  }
40
- },
41
- "sentinel_constants": {
42
- "INV_E": 0.36787944117144233,
43
- "C1": -0.007994021805952546,
44
- "C2": 0.00020005604296784437
45
- },
46
- "training_data": {
47
- "languages": [
48
- "en",
49
- "fr",
50
- "de",
51
- "es",
52
- "zh",
53
- "ja",
54
- "ar",
55
- "ru",
56
- "ko",
57
- "hi",
58
- "pt",
59
- "it",
60
- "nl",
61
- "pl",
62
- "vi",
63
- "th",
64
- "tr",
65
- "he",
66
- "uk",
67
- "sv"
68
- ],
69
- "total_samples": 52000
70
  }
71
  }
 
1
  {
2
+ "summary": {
3
+ "Sentinel-v2": {
4
+ "compress": 4.3427,
5
+ "fertility": 10.5022,
6
+ "vocab": 94208,
7
+ "efficiency": 0.046097
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  },
9
+ "GPT-2": {
10
+ "compress": 2.4381,
11
+ "fertility": 28.8158,
12
+ "vocab": 50257,
13
+ "efficiency": 0.048513
14
  },
15
+ "Gemma": {
16
+ "compress": 5.3287,
17
+ "fertility": 8.348,
18
+ "vocab": 256000,
19
+ "efficiency": 0.020815
20
  },
21
+ "Qwen2": {
22
+ "compress": 4.3289,
23
+ "fertility": 10.4499,
24
+ "vocab": 151936,
25
+ "efficiency": 0.028491
26
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  }
28
  }
sentinel_manifold.json CHANGED
@@ -1,36 +1,51 @@
1
  {
 
2
  "framework": "Sentinel Manifold",
3
- "theorem": "Gradient Axiom: lim_{z\u2192\u221e} F'(z)/F(z) = 1/e",
4
- "function": "F(z) = \u03a3_{n=1}^\u221e z^n / n^n (Sophomore's Dream)",
 
 
 
 
 
 
 
 
5
  "constants": {
6
- "INV_E": {
7
- "value": 0.36787944117144233,
8
- "role": "Vocabulary allocation ratio / embedding gain"
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  },
10
- "C1": {
11
- "value": -0.007994021805952546,
12
- "role": "Attracting fixed point / quantization zero-point"
 
 
13
  },
14
- "C2": {
15
- "value": 0.00020005604296784437,
16
- "role": "Escape threshold / fertility fairness bound"
 
 
17
  }
18
  },
19
  "modality_architecture": {
20
- "text": "ByteLevel BPE (32K) with NFKC normalization, 20-language training",
21
- "image": "Discrete VQ codebook (16,384 tokens), Cosmos/VQGAN compatible",
22
- "audio": "Discrete VQ codebook (8,192 tokens), EnCodec/SoundStream compatible",
23
- "video": "Discrete VQ codebook (4,096 tokens), Cosmos-DV compatible"
24
- },
25
- "innovations": [
26
- "1/e-proportioned vocabulary allocation across modalities",
27
- "Native multimodal routing with zero-overhead modality switching",
28
- "Sentinel special tokens for manifold-aware computation",
29
- "20-language multilingual training for cross-lingual fairness",
30
- "Code + Math + Scientific notation native support",
31
- "Compatible with all HF transformers models"
32
- ],
33
- "version": "1.0.0",
34
- "license": "MIT",
35
- "author": "Romain Abdel-Aal (ASI The Sentinel V5.2)"
36
  }
 
1
  {
2
+ "version": "2.0.0",
3
  "framework": "Sentinel Manifold",
4
+ "theorem": "lim F'(z)/F(z) = 1/e",
5
+ "function": "F(z) = \u03a3 z^n/n^n",
6
+ "text_vocab": 65536,
7
+ "image_codebook": 16384,
8
+ "audio_codebook": 8192,
9
+ "video_codebook": 4096,
10
+ "total_vocab": 94208,
11
+ "training_languages": 30,
12
+ "training_samples": 287600,
13
+ "training_chars": 465942294,
14
  "constants": {
15
+ "INV_E": 0.36787944117144233,
16
+ "C1": -0.007994021805952546,
17
+ "C2": 0.00020005604296784437
18
+ },
19
+ "benchmark": {
20
+ "Sentinel-v2": {
21
+ "compress": 4.3427,
22
+ "fertility": 10.5022,
23
+ "vocab": 94208,
24
+ "efficiency": 0.046097
25
+ },
26
+ "GPT-2": {
27
+ "compress": 2.4381,
28
+ "fertility": 28.8158,
29
+ "vocab": 50257,
30
+ "efficiency": 0.048513
31
  },
32
+ "Gemma": {
33
+ "compress": 5.3287,
34
+ "fertility": 8.348,
35
+ "vocab": 256000,
36
+ "efficiency": 0.020815
37
  },
38
+ "Qwen2": {
39
+ "compress": 4.3289,
40
+ "fertility": 10.4499,
41
+ "vocab": 151936,
42
+ "efficiency": 0.028491
43
  }
44
  },
45
  "modality_architecture": {
46
+ "text": "ByteLevel BPE (65,536), NFKC, 30 languages",
47
+ "image": "VQ codebook (16,384), Cosmos/VQGAN/FSQ compatible",
48
+ "audio": "VQ codebook (8,192), EnCodec/SoundStream compatible",
49
+ "video": "VQ codebook (4,096), Cosmos-DV compatible"
50
+ }
 
 
 
 
 
 
 
 
 
 
 
51
  }
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff