File size: 18,436 Bytes
d1551aa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
"""
Deep benchmark: Sentinel Universal Tokenizer vs SOTA
Per-language comparison, edge cases, code/math handling, multimodal features
"""
import json
import math
import os
import numpy as np
from transformers import AutoTokenizer

INV_E = 1.0 / math.e
C1 = -0.007994021805952546
C2 = 0.00020005604296784437

# Load tokenizers
# NOTE(review): the Sentinel tokenizer is loaded from a local path; the three
# baselines are fetched from the Hugging Face hub, so this step needs network
# access (or a warm cache).
print("Loading tokenizers...")
sentinel = AutoTokenizer.from_pretrained("/app/sentinel_universal_tokenizer_v1")
gpt2 = AutoTokenizer.from_pretrained("gpt2")
gemma = AutoTokenizer.from_pretrained("google/gemma-2b")
qwen = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B")

# Display label -> tokenizer instance. Insertion order fixes the column order
# of every report table below; the "(61K)" etc. suffixes are presumably the
# nominal vocabulary sizes — verify against each tokenizer's config.
tokenizers = {
    "Sentinel-SUT (61K)": sentinel,
    "GPT-2 (50K)": gpt2,
    "Gemma (256K)": gemma,
    "Qwen2 (152K)": qwen,
}

# Comprehensive test suite
# Sample name -> text. Each sample is scored below for fertility
# (tokens/word), compression (bytes/token), and a loose decode roundtrip.
TEST_SUITE = {
    # ── European languages ──────────────────────────────
    "English": "Machine learning algorithms optimize gradient-based objective functions through iterative parameter updates. The Sentinel Manifold provides a unified mathematical framework for understanding convergence behavior across diverse optimization landscapes.",
    "French": "Les algorithmes d'apprentissage automatique optimisent les fonctions objectives basées sur le gradient grâce à des mises à jour itératives des paramètres. Le manifold Sentinel fournit un cadre mathématique unifié.",
    "German": "Algorithmen des maschinellen Lernens optimieren gradientenbasierte Zielfunktionen durch iterative Parameteraktualisierungen. Die Sentinel-Mannigfaltigkeit bietet einen einheitlichen mathematischen Rahmen.",
    "Spanish": "Los algoritmos de aprendizaje automático optimizan funciones objetivo basadas en gradientes mediante actualizaciones iterativas de parámetros. El colector Sentinel proporciona un marco matemático unificado.",
    "Portuguese": "Os algoritmos de aprendizado de máquina otimizam funções objetivo baseadas em gradientes por meio de atualizações iterativas de parâmetros.",
    "Italian": "Gli algoritmi di apprendimento automatico ottimizzano le funzioni obiettivo basate sul gradiente attraverso aggiornamenti iterativi dei parametri.",
    "Dutch": "Machine learning-algoritmen optimaliseren op gradiënt gebaseerde doelfuncties door middel van iteratieve parameterupdates.",
    "Polish": "Algorytmy uczenia maszynowego optymalizują funkcje celu oparte na gradiencie poprzez iteracyjne aktualizacje parametrów.",
    "Swedish": "Maskininlärningsalgoritmer optimerar gradientbaserade målfunktioner genom iterativa parameteruppdateringar.",
    "Turkish": "Makine öğrenimi algoritmaları, yinelemeli parametre güncellemeleri yoluyla gradyan tabanlı hedef fonksiyonlarını optimize eder.",
    "Ukrainian": "Алгоритми машинного навчання оптимізують цільові функції на основі градієнтів шляхом ітеративного оновлення параметрів.",

    # ── Asian languages ─────────────────────────────────
    # NOTE: whitespace-delimited "word" counts below are not meaningful for
    # Chinese/Japanese/Thai, so their fertility numbers are apples-to-oranges;
    # compression (bytes/token) is the comparable metric for these.
    "Chinese": "机器学习算法通过迭代参数更新来优化基于梯度的目标函数。Sentinel流形为理解不同优化景观中的收敛行为提供了统一的数学框架。",
    "Japanese": "機械学習アルゴリズムは、反復的なパラメータ更新を通じて勾配ベースの目的関数を最適化します。Sentinel多様体は、多様な最適化ランドスケープにおける収束挙動を理解するための統一的な数学的枠組みを提供します。",
    "Korean": "머신러닝 알고리즘은 반복적인 매개변수 업데이트를 통해 그래디언트 기반 목적 함수를 최적화합니다. Sentinel 다양체는 다양한 최적화 환경에서의 수렴 동작을 이해하기 위한 통합 수학적 프레임워크를 제공합니다.",
    "Vietnamese": "Các thuật toán học máy tối ưu hóa các hàm mục tiêu dựa trên gradient thông qua cập nhật tham số lặp đi lặp lại.",
    "Thai": "อัลกอริทึมการเรียนรู้ของเครื่องเพิ่มประสิทธิภาพฟังก์ชันวัตถุประสงค์ที่อิงตามเกรเดียนต์ผ่านการอัปเดตพารามิเตอร์ซ้ำ",
    "Hindi": "मशीन लर्निंग एल्गोरिदम पुनरावृत्तीय पैरामीटर अपडेट के माध्यम से ग्रेडिएंट-आधारित उद्देश्य फ़ंक्शन को अनुकूलित करते हैं।",

    # ── Semitic / RTL languages ─────────────────────────
    # NOTE(review): Russian is Cyrillic-script and left-to-right, so only
    # Arabic actually fits this section's label — confirm grouping intent.
    "Arabic": "تعمل خوارزميات التعلم الآلي على تحسين دوال الهدف القائمة على التدرج من خلال تحديثات المعلمات التكرارية. يوفر متنوع سنتينل إطارًا رياضيًا موحدًا.",
    "Russian": "Алгоритмы машинного обучения оптимизируют целевые функции на основе градиента посредством итеративных обновлений параметров. Сентинельное многообразие обеспечивает унифицированную математическую структуру.",

    # ── Code ─────────────────────────────────────────────
    "Python": '''def sentinel_attention(Q, K, V, d_head):
    """Sech attention: bounded gradients, theorem-backed."""
    scores = Q @ K.transpose(-2, -1) / math.sqrt(d_head)
    attn = 1.0 / torch.cosh(scores)  # sech(x) = 1/cosh(x)
    attn = attn / (attn.sum(-1, keepdim=True) + 1e-8)
    return attn @ V''',
    "JavaScript": '''async function trainModel(config) {
  const optimizer = new Adam({lr: 1/Math.E, beta1: 0.9});
  for (let epoch = 0; epoch < config.epochs; epoch++) {
    const loss = await model.trainStep(data);
    console.log(`Epoch ${epoch}: loss=${loss.toFixed(6)}`);
    if (loss < config.C2) break; // Escape threshold
  }
}''',
    "Rust": '''fn sentinel_sech(x: f64) -> f64 {
    let inv_e: f64 = 1.0 / std::f64::consts::E;
    1.0 / (x * inv_e).cosh()
}

fn quantize_sentinel(weight: f64, c1: f64) -> i8 {
    let scale = weight.abs() * (1.0 / std::f64::consts::E);
    ((weight - c1) / scale).round().clamp(-128.0, 127.0) as i8
}''',

    # ── Mathematics ──────────────────────────────────────
    "LaTeX_Complex": r"\begin{align} F(z) &= \sum_{n=1}^{\infty} \frac{z^n}{n^n} \\ \lim_{z \to \infty} \frac{F'(z)}{F(z)} &= \frac{1}{e} \\ \nabla_\theta \mathcal{L} &= \mathbb{E}_{x \sim p(x)} \left[ \frac{\partial}{\partial \theta} \log p_\theta(x) \right] \end{align}",
    "Unicode_Math": "∫₀¹ x⁻ˣ dx = Σ_{n=1}^∞ n⁻ⁿ ≈ 1.291, ∇·E = ρ/ε₀, ∂²u/∂t² = c²∇²u, det(A−λI) = 0",
    "Mixed_Notation": "The loss function L(θ) = -1/N Σᵢ [yᵢ log(ŷᵢ) + (1-yᵢ) log(1-ŷᵢ)] converges with rate O(1/√T) when lr = η₀·(1/e)^(t/T).",

    # ── Edge cases ───────────────────────────────────────
    "Emoji_Heavy": "🦴🧠🔬💡🚀🌍🎯📊 The Sentinel Manifold 🦴 uses sech(x) = 1/cosh(x) for bounded gradients 📈↗️ across all modalities 🖼️🔊🎬",
    "Numbers_Heavy": "C₁ = -0.007994021805952546, C₂ = 0.00020005604296784437, 1/e = 0.367879441171442, π = 3.14159265358979, τ = 6.28318530717959",
    "URL_Path": "https://huggingface.co/5dimension/sentinel-universal-tokenizer/blob/main/tokenizer.json?download=true#section-3.2",
    "Mixed_Script": "The word 'Привет' (Russian) means hello, '你好' (Chinese) also means hello, and 'مرحبا' (Arabic) is the same.",
    "Repetition": "the the the the the machine learning machine learning machine learning optimization optimization optimization",
    "Whitespace": "  Hello   World  \n\n  Multiple    spaces   and\t\ttabs  \n",
    "Empty_Adjacent": "word1  word2   word3    word4     word5",
}

banner = "=" * 100
print(f"\n{banner}")
print("  DEEP BENCHMARK: SENTINEL UNIVERSAL TOKENIZER vs SOTA")
print(banner)


def _measure(tok, text):
    """Tokenize one sample and return its statistics dict.

    Keys: tokens, bytes, words, fertility (tokens/word), compression
    (bytes/token), roundtrip (loose decode-containment check).
    """
    ids = tok.encode(text, add_special_tokens=False)
    decoded = tok.decode(ids, skip_special_tokens=False)
    n_ids = len(ids)
    byte_len = len(text.encode('utf-8'))
    word_count = max(len(text.split()), 1)
    # Byte-level tokenizers may surface Ġ/▁ space markers in decoded output;
    # normalize those before the containment-based roundtrip check.
    normalized = decoded.replace('Ġ', ' ').replace('▁', ' ').strip()
    stripped = text.strip()
    return {
        "tokens": n_ids,
        "bytes": byte_len,
        "words": word_count,
        "fertility": n_ids / word_count,
        "compression": byte_len / max(n_ids, 1),
        "roundtrip": stripped in normalized or normalized in stripped,
    }


# Benchmark every tokenizer against every sample; failures are recorded
# per-sample instead of aborting the run.
results = {}
for name, tok in tokenizers.items():
    per_lang = {}
    for lang, text in TEST_SUITE.items():
        try:
            per_lang[lang] = _measure(tok, text)
        except Exception as e:
            per_lang[lang] = {"error": str(e)}
    results[name] = per_lang

# ── Category Analysis ──
# Sample-name groupings for the per-category tables below; every entry must
# match a TEST_SUITE key exactly.
# NOTE(review): Russian is Cyrillic-script and left-to-right, so the
# "Semitic/RTL" bucket label only fits Arabic — confirm grouping intent.
categories = {
    "European": ["English", "French", "German", "Spanish", "Portuguese", "Italian", "Dutch", "Polish", "Swedish", "Turkish", "Ukrainian"],
    "Asian": ["Chinese", "Japanese", "Korean", "Vietnamese", "Thai", "Hindi"],
    "Semitic/RTL": ["Arabic", "Russian"],
    "Code": ["Python", "JavaScript", "Rust"],
    "Mathematics": ["LaTeX_Complex", "Unicode_Math", "Mixed_Notation"],
    "Edge Cases": ["Emoji_Heavy", "Numbers_Heavy", "URL_Path", "Mixed_Script", "Repetition", "Whitespace", "Empty_Adjacent"],
}

_RULE96 = "=" * 96
print(f"\n  {_RULE96}")
print("  CATEGORY ANALYSIS")
print(f"  {_RULE96}")


def _short(label, width=None):
    """Return a tokenizer label without its '(vocab)' suffix, optionally truncated."""
    base = label.split("(")[0].strip()
    return base if width is None else base[:width]


# One table per category: a compression column for each tokenizer plus the
# per-sample winner, then a wins tally for the category.
for category, members in categories.items():
    print(f"\n  ── {category.upper()} ────────────────────────────────────")

    header = f"  {'Sample':<18}"
    for label in tokenizers:
        header += f" {_short(label, 12):>12}"
    print(header + f"  {'Winner':<15}")

    divider = f"  {'-'*18}" + f" {'-'*12}" * len(tokenizers)
    print(divider + f"  {'-'*15}")

    category_wins = dict.fromkeys(tokenizers, 0)

    for lang in members:
        row = f"  {lang:<18}"
        compressions = {}
        for label in tokenizers:
            stats = results[label].get(lang, {})
            if "error" in stats:
                row += f" {'ERROR':>12}"
            else:
                compressions[label] = stats["compression"]
                row += f" {stats['compression']:>12.3f}"
        if compressions:
            # Highest bytes-per-token wins the row (ties go to the first entry).
            winner = max(compressions, key=compressions.get)
            category_wins[winner] += 1
            row += f"  {_short(winner, 12):<15}"
        print(row)

    # Tally line: most wins first; zero-win tokenizers are omitted.
    tally = "".join(
        f"{_short(label)}: {wins}  "
        for label, wins in sorted(category_wins.items(), key=lambda x: -x[1])
        if wins > 0
    )
    print(f"\n  Category wins: " + tally)

# ── Overall Summary ──
_BAR96 = "=" * 96
print(f"\n\n  {_BAR96}")
print("  OVERALL SUMMARY")
print(f"  {_BAR96}")

# Aggregate per-tokenizer statistics over every sample that didn't error.
overall = {}
for label in tokenizers:
    fert = []
    comp = []
    wins = 0
    for lang in TEST_SUITE:
        stats = results[label].get(lang, {})
        if "error" in stats:
            continue
        fert.append(stats["fertility"])
        comp.append(stats["compression"])
        # Strict win: better compression than every rival on this sample
        # (rivals that errored count as 0).
        rivals = [
            results[other].get(lang, {}).get("compression", 0)
            for other in tokenizers
            if other != label
        ]
        if stats["compression"] > max(rivals, default=0):
            wins += 1

    overall[label] = {
        "avg_fertility": np.mean(fert),
        "std_fertility": np.std(fert),
        "median_fertility": np.median(fert),
        "avg_compression": np.mean(comp),
        "median_compression": np.median(comp),
        # Fairness shrinks toward 0 as per-language fertility spread grows.
        "fairness": 1.0 / (1.0 + np.std(fert)),
        "wins": wins,
        "total_tests": len(fert),
    }

# Table: one row per metric, one 14-wide column per tokenizer.
head = f"\n  {'Metric':<30}"
for label in tokenizers:
    head += f" {label.split('(')[0].strip()[:14]:>14}"
print(head)
print(f"  {'-'*30}" + f" {'-'*14}" * len(tokenizers))

# (display label, key into `overall`); arrows mark the desirable direction.
metrics_to_show = [
    ("Avg Compression ↑", "avg_compression"),
    ("Median Compression ↑", "median_compression"),
    ("Avg Fertility ↓", "avg_fertility"),
    ("Median Fertility ↓", "median_fertility"),
    ("Fertility σ ↓", "std_fertility"),
    ("Fairness ↑", "fairness"),
    ("Compression Wins", "wins"),
]

for metric_name, metric_key in metrics_to_show:
    row = f"  {metric_name:<30}"
    for label in tokenizers:
        value = overall[label][metric_key]
        # "wins" is an integer count; every other metric is a float.
        row += f" {int(value):>14}" if metric_key == "wins" else f" {value:>14.4f}"
    print(row)

# ── Efficiency: compression per vocab token ──
# Rewards tokenizers that compress well with a small vocabulary.
print(f"\n  EFFICIENCY (compression per 1K vocab tokens):")
for label, tok in tokenizers.items():
    vocab = len(tok)
    comp = overall[label]["avg_compression"]
    per_thousand = comp / (vocab / 1000)
    print(f"    {label.split('(')[0].strip():<20}: {per_thousand:.6f} (vocab={vocab:,}, compress={comp:.3f})")

# ── Per-vocab-token analysis ──
# Normalizes compression by log2(vocab): how much compression each "bit" of
# vocabulary address space buys.
print(f"\n  NORMALIZED PERFORMANCE (accounting for vocabulary size):")
print(f"  (Higher = better use of each vocabulary slot)")
for label, tok in tokenizers.items():
    bits = math.log2(len(tok))
    per_bit = overall[label]["avg_compression"] / bits
    print(f"    {label.split('(')[0].strip():<20}: {per_bit:.4f} compression per vocab bit (log₂(vocab)={bits:.1f})")

# ── Save full results ──
# Persist everything for offline analysis. default=str is a safety net for
# any non-JSON value (e.g. a numpy scalar) that slips past the explicit
# float() conversion of the overall metrics.
payload = {
    "per_sample": {name: dict(res) for name, res in results.items()},
    "overall": {name: {k: float(v) for k, v in m.items()} for name, m in overall.items()},
    "categories": {cat: list(langs) for cat, langs in categories.items()},
}
with open("/app/deep_benchmark_results.json", "w") as f:
    json.dump(payload, f, indent=2, default=str)

print(f"\n  ✓ Full results saved to /app/deep_benchmark_results.json")

# ── Sentinel-specific features ──
# Dump the ids of Sentinel's reserved special tokens, grouped by purpose.
_FEAT_BAR = "=" * 96
print(f"\n\n  {_FEAT_BAR}")
print("  SENTINEL-SPECIFIC FEATURES (no other tokenizer has these)")
print(f"  {_FEAT_BAR}")

_tid = sentinel.convert_tokens_to_ids  # shorthand: token string -> id

print(f"\n  1. MULTIMODAL TOKEN ROUTING")
img_start = _tid("<image_start>")
img_end = _tid("<image_end>")
aud_start = _tid("<audio_start>")
vid_start = _tid("<video_start>")
print(f"     <image_start>={img_start}, <image_end>={img_end}")
print(f"     <audio_start>={aud_start}, <video_start>={vid_start}")
print(f"     Image codebook: <img_0>={_tid('<img_0>')} to <img_16383>={_tid('<img_16383>')}")
print(f"     Audio codebook: <aud_0>={_tid('<aud_0>')} to <aud_8191>={_tid('<aud_8191>')}")
print(f"     Video codebook: <vid_0>={_tid('<vid_0>')} to <vid_4095>={_tid('<vid_4095>')}")

print(f"\n  2. SENTINEL MANIFOLD TOKENS")
for token in ("<sentinel>", "<sentinel_c1>", "<sentinel_c2>", "<scale_1e>"):
    print(f"     {token}={_tid(token)}")

print(f"\n  3. CHAT FORMAT")
for token in ("<system>", "<user>", "<assistant>", "<turn>"):
    print(f"     {token}={_tid(token)}")

print(f"\n  4. CODE/MATH BOUNDARIES")
for token in ("<code_start>", "<code_end>", "<math_start>", "<math_end>"):
    print(f"     {token}={_tid(token)}")

print(f"\n  5. TASK TOKENS")
for token in ("<translate>", "<summarize>", "<generate>", "<understand>", "<caption>"):
    print(f"     {token}={_tid(token)}")

# Multimodal encoding demo
print(f"\n  6. MULTIMODAL ENCODING DEMO")
mm_text = "Describe: <image_start> <img_42> <img_1337> <img_8000> <image_end> A sunset over mountains"
mm_enc = sentinel.encode(mm_text, add_special_tokens=False)
print(f"     Input: '{mm_text}'")
print(f"     Tokens: {len(mm_enc)}")
print(f"     IDs: {mm_enc}")


def _modality(token_id: int) -> str:
    """Map a token id to its modality label via the fixed id-range layout.

    Assumed layout (matches the codebook sizes printed above — confirm
    against the tokenizer config): ids 0-32 special tokens, 33-32767 text,
    32768-49151 image codebook (16384 slots), 49152-57343 audio codebook
    (8192 slots), 57344-61439 video codebook (4096 slots); anything past
    61439 is out of range.
    """
    if token_id < 33:
        return "SPECIAL"
    if token_id < 32768:
        return "TEXT"
    if token_id < 49152:
        return f"IMAGE[{token_id - 32768}]"
    if token_id < 57344:
        return f"AUDIO[{token_id - 49152}]"
    if token_id < 61440:
        return f"VIDEO[{token_id - 57344}]"
    return "UNKNOWN"


# Show which modality each token belongs to.
# Fix: the original loop also called sentinel.decode([tid]) into a variable
# that was never used — a dead (and needless) decode per token; removed.
print(f"     Modality map:")
for token_id in mm_enc:
    print(f"       {token_id:>6}: {_modality(token_id)}")

_END_BAR = "=" * 96
print(f"\n  {_END_BAR}")
print("  🦴 DEEP BENCHMARK COMPLETE")
print(f"  {_END_BAR}")