"""
Deep benchmark: Sentinel Universal Tokenizer vs SOTA
Per-language comparison, edge cases, code/math handling, multimodal features
"""
import json
import math
import os
import numpy as np
from transformers import AutoTokenizer
INV_E = 1.0 / math.e
C1 = -0.007994021805952546
C2 = 0.00020005604296784437
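# Sentinel Manifold reference constants. The benchmark loop below never reads
# them; they are kept here so the values quoted in the test texts
# (see Numbers_Heavy) have a single source of truth in this file.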
# Load tokenizers
print("Loading tokenizers...")
sentinel = AutoTokenizer.from_pretrained("/app/sentinel_universal_tokenizer_v1")
gpt2 = AutoTokenizer.from_pretrained("gpt2")
gemma = AutoTokenizer.from_pretrained("google/gemma-2b")
qwen = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B")
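# Note: gpt2, gemma, and qwen are fetched from the Hugging Face Hub on first
# run (google/gemma-2b is gated and may require accepting its license), while
# the Sentinel tokenizer loads from the local path above.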
tokenizers = {
"Sentinel-SUT (61K)": sentinel,
"GPT-2 (50K)": gpt2,
"Gemma (256K)": gemma,
"Qwen2 (152K)": qwen,
}
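# The parenthesized vocab sizes are nominal labels; exact sizes are read via
# len(tokenizer) in the efficiency section below.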
# Comprehensive test suite
TEST_SUITE = {
# ── European languages ──────────────────────────────
"English": "Machine learning algorithms optimize gradient-based objective functions through iterative parameter updates. The Sentinel Manifold provides a unified mathematical framework for understanding convergence behavior across diverse optimization landscapes.",
"French": "Les algorithmes d'apprentissage automatique optimisent les fonctions objectives basées sur le gradient grâce à des mises à jour itératives des paramètres. Le manifold Sentinel fournit un cadre mathématique unifié.",
"German": "Algorithmen des maschinellen Lernens optimieren gradientenbasierte Zielfunktionen durch iterative Parameteraktualisierungen. Die Sentinel-Mannigfaltigkeit bietet einen einheitlichen mathematischen Rahmen.",
"Spanish": "Los algoritmos de aprendizaje automático optimizan funciones objetivo basadas en gradientes mediante actualizaciones iterativas de parámetros. El colector Sentinel proporciona un marco matemático unificado.",
"Portuguese": "Os algoritmos de aprendizado de máquina otimizam funções objetivo baseadas em gradientes por meio de atualizações iterativas de parâmetros.",
"Italian": "Gli algoritmi di apprendimento automatico ottimizzano le funzioni obiettivo basate sul gradiente attraverso aggiornamenti iterativi dei parametri.",
"Dutch": "Machine learning-algoritmen optimaliseren op gradiënt gebaseerde doelfuncties door middel van iteratieve parameterupdates.",
"Polish": "Algorytmy uczenia maszynowego optymalizują funkcje celu oparte na gradiencie poprzez iteracyjne aktualizacje parametrów.",
"Swedish": "Maskininlärningsalgoritmer optimerar gradientbaserade målfunktioner genom iterativa parameteruppdateringar.",
"Turkish": "Makine öğrenimi algoritmaları, yinelemeli parametre güncellemeleri yoluyla gradyan tabanlı hedef fonksiyonlarını optimize eder.",
"Ukrainian": "Алгоритми машинного навчання оптимізують цільові функції на основі градієнтів шляхом ітеративного оновлення параметрів.",
# ── Asian languages ─────────────────────────────────
"Chinese": "机器学习算法通过迭代参数更新来优化基于梯度的目标函数。Sentinel流形为理解不同优化景观中的收敛行为提供了统一的数学框架。",
"Japanese": "機械学習アルゴリズムは、反復的なパラメータ更新を通じて勾配ベースの目的関数を最適化します。Sentinel多様体は、多様な最適化ランドスケープにおける収束挙動を理解するための統一的な数学的枠組みを提供します。",
"Korean": "머신러닝 알고리즘은 반복적인 매개변수 업데이트를 통해 그래디언트 기반 목적 함수를 최적화합니다. Sentinel 다양체는 다양한 최적화 환경에서의 수렴 동작을 이해하기 위한 통합 수학적 프레임워크를 제공합니다.",
"Vietnamese": "Các thuật toán học máy tối ưu hóa các hàm mục tiêu dựa trên gradient thông qua cập nhật tham số lặp đi lặp lại.",
"Thai": "อัลกอริทึมการเรียนรู้ของเครื่องเพิ่มประสิทธิภาพฟังก์ชันวัตถุประสงค์ที่อิงตามเกรเดียนต์ผ่านการอัปเดตพารามิเตอร์ซ้ำ",
"Hindi": "मशीन लर्निंग एल्गोरिदम पुनरावृत्तीय पैरामीटर अपडेट के माध्यम से ग्रेडिएंट-आधारित उद्देश्य फ़ंक्शन को अनुकूलित करते हैं।",
    # ── Arabic (RTL) / Russian ──────────────────────────
"Arabic": "تعمل خوارزميات التعلم الآلي على تحسين دوال الهدف القائمة على التدرج من خلال تحديثات المعلمات التكرارية. يوفر متنوع سنتينل إطارًا رياضيًا موحدًا.",
"Russian": "Алгоритмы машинного обучения оптимизируют целевые функции на основе градиента посредством итеративных обновлений параметров. Сентинельное многообразие обеспечивает унифицированную математическую структуру.",
# ── Code ─────────────────────────────────────────────
"Python": '''def sentinel_attention(Q, K, V, d_head):
"""Sech attention: bounded gradients, theorem-backed."""
scores = Q @ K.transpose(-2, -1) / math.sqrt(d_head)
attn = 1.0 / torch.cosh(scores) # sech(x) = 1/cosh(x)
attn = attn / (attn.sum(-1, keepdim=True) + 1e-8)
return attn @ V''',
"JavaScript": '''async function trainModel(config) {
const optimizer = new Adam({lr: 1/Math.E, beta1: 0.9});
for (let epoch = 0; epoch < config.epochs; epoch++) {
const loss = await model.trainStep(data);
console.log(`Epoch ${epoch}: loss=${loss.toFixed(6)}`);
if (loss < config.C2) break; // Escape threshold
}
}''',
"Rust": '''fn sentinel_sech(x: f64) -> f64 {
let inv_e: f64 = 1.0 / std::f64::consts::E;
1.0 / (x * inv_e).cosh()
}
fn quantize_sentinel(weight: f64, c1: f64) -> i8 {
let scale = weight.abs() * (1.0 / std::f64::consts::E);
((weight - c1) / scale).round().clamp(-128.0, 127.0) as i8
}''',
# ── Mathematics ──────────────────────────────────────
"LaTeX_Complex": r"\begin{align} F(z) &= \sum_{n=1}^{\infty} \frac{z^n}{n^n} \\ \lim_{z \to \infty} \frac{F'(z)}{F(z)} &= \frac{1}{e} \\ \nabla_\theta \mathcal{L} &= \mathbb{E}_{x \sim p(x)} \left[ \frac{\partial}{\partial \theta} \log p_\theta(x) \right] \end{align}",
"Unicode_Math": "∫₀¹ x⁻ˣ dx = Σ_{n=1}^∞ n⁻ⁿ ≈ 1.291, ∇·E = ρ/ε₀, ∂²u/∂t² = c²∇²u, det(A−λI) = 0",
"Mixed_Notation": "The loss function L(θ) = -1/N Σᵢ [yᵢ log(ŷᵢ) + (1-yᵢ) log(1-ŷᵢ)] converges with rate O(1/√T) when lr = η₀·(1/e)^(t/T).",
# ── Edge cases ───────────────────────────────────────
"Emoji_Heavy": "🦴🧠🔬💡🚀🌍🎯📊 The Sentinel Manifold 🦴 uses sech(x) = 1/cosh(x) for bounded gradients 📈↗️ across all modalities 🖼️🔊🎬",
"Numbers_Heavy": "C₁ = -0.007994021805952546, C₂ = 0.00020005604296784437, 1/e = 0.367879441171442, π = 3.14159265358979, τ = 6.28318530717959",
"URL_Path": "https://huggingface.co/5dimension/sentinel-universal-tokenizer/blob/main/tokenizer.json?download=true#section-3.2",
"Mixed_Script": "The word 'Привет' (Russian) means hello, '你好' (Chinese) also means hello, and 'مرحبا' (Arabic) is the same.",
"Repetition": "the the the the the machine learning machine learning machine learning optimization optimization optimization",
"Whitespace": " Hello World \n\n Multiple spaces and\t\ttabs \n",
"Empty_Adjacent": "word1 word2 word3 word4 word5",
}
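# 32 samples total: 11 European, 6 Asian, 2 Arabic/Russian, 3 code,
# 3 mathematics, 7 edge cases (categorized below).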
print(f"\n{'='*100}")
print(f" DEEP BENCHMARK: SENTINEL UNIVERSAL TOKENIZER vs SOTA")
print(f"{'='*100}")
# Run benchmarks
results = {}
for name, tok in tokenizers.items():
tok_results = {}
for lang, text in TEST_SUITE.items():
try:
enc = tok.encode(text, add_special_tokens=False)
dec = tok.decode(enc, skip_special_tokens=False)
n_tokens = len(enc)
n_bytes = len(text.encode('utf-8'))
n_words = max(len(text.split()), 1)
# Check if decoded text contains the original (byte-level may add Ġ prefix)
clean_dec = dec.replace('Ġ', ' ').replace('▁', ' ').strip()
roundtrip = text.strip() in clean_dec or clean_dec in text.strip()
tok_results[lang] = {
"tokens": n_tokens,
"bytes": n_bytes,
"words": n_words,
"fertility": n_tokens / n_words,
"compression": n_bytes / max(n_tokens, 1),
"roundtrip": roundtrip,
}
except Exception as e:
tok_results[lang] = {"error": str(e)}
results[name] = tok_results
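# Worked example of the two metrics: a 100-byte, 20-word sample encoded into
# 25 tokens scores fertility 25/20 = 1.25 tokens/word (lower is better) and
# compression 100/25 = 4.0 bytes/token (higher is better).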
# ── Category Analysis ──
categories = {
"European": ["English", "French", "German", "Spanish", "Portuguese", "Italian", "Dutch", "Polish", "Swedish", "Turkish", "Ukrainian"],
"Asian": ["Chinese", "Japanese", "Korean", "Vietnamese", "Thai", "Hindi"],
"Semitic/RTL": ["Arabic", "Russian"],
"Code": ["Python", "JavaScript", "Rust"],
"Mathematics": ["LaTeX_Complex", "Unicode_Math", "Mixed_Notation"],
"Edge Cases": ["Emoji_Heavy", "Numbers_Heavy", "URL_Path", "Mixed_Script", "Repetition", "Whitespace", "Empty_Adjacent"],
}
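# Sanity check: the categories should partition TEST_SUITE exactly.
assert sorted(k for ks in categories.values() for k in ks) == sorted(TEST_SUITE), \
    "every TEST_SUITE key must appear in exactly one category"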
print(f"\n {'='*96}")
print(f" CATEGORY ANALYSIS")
print(f" {'='*96}")
for category, langs in categories.items():
print(f"\n ── {category.upper()} ────────────────────────────────────")
# Header
print(f" {'Sample':<18}", end="")
for name in tokenizers:
short = name.split("(")[0].strip()[:12]
print(f" {short:>12}", end="")
print(f" {'Winner':<15}")
print(f" {'-'*18}", end="")
for _ in tokenizers:
print(f" {'-'*12}", end="")
print(f" {'-'*15}")
category_wins = {name: 0 for name in tokenizers}
for lang in langs:
print(f" {lang:<18}", end="")
compressions = {}
for name in tokenizers:
r = results[name].get(lang, {})
if "error" in r:
print(f" {'ERROR':>12}", end="")
else:
comp = r["compression"]
compressions[name] = comp
print(f" {comp:>12.3f}", end="")
if compressions:
winner = max(compressions, key=compressions.get)
category_wins[winner] += 1
short_winner = winner.split("(")[0].strip()[:12]
print(f" {short_winner:<15}")
else:
print()
# Category summary
print(f"\n Category wins: ", end="")
for name, wins in sorted(category_wins.items(), key=lambda x: -x[1]):
short = name.split("(")[0].strip()
if wins > 0:
print(f"{short}: {wins} ", end="")
print()
# ── Overall Summary ──
print(f"\n\n {'='*96}")
print(f" OVERALL SUMMARY")
print(f" {'='*96}")
overall = {}
for name in tokenizers:
fertilities = []
compressions = []
wins = 0
for lang in TEST_SUITE:
r = results[name].get(lang, {})
if "error" not in r:
fertilities.append(r["fertility"])
compressions.append(r["compression"])
# Count compression wins
others = {n: results[n].get(lang, {}).get("compression", 0) for n in tokenizers if n != name}
if r["compression"] > max(others.values(), default=0):
wins += 1
overall[name] = {
"avg_fertility": np.mean(fertilities),
"std_fertility": np.std(fertilities),
"median_fertility": np.median(fertilities),
"avg_compression": np.mean(compressions),
"median_compression": np.median(compressions),
"fairness": 1.0 / (1.0 + np.std(fertilities)),
"wins": wins,
"total_tests": len(fertilities),
}
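# "Fairness" is 1 / (1 + σ(fertility)): it equals 1.0 when a tokenizer's
# fertility is identical on every sample and shrinks as per-language variance
# grows, so it rewards tokenizers that treat all languages evenly.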
# Table
print(f"\n {'Metric':<30}", end="")
for name in tokenizers:
short = name.split("(")[0].strip()[:14]
print(f" {short:>14}", end="")
print()
print(f" {'-'*30}", end="")
for _ in tokenizers:
print(f" {'-'*14}", end="")
print()
metrics_to_show = [
("Avg Compression ↑", "avg_compression"),
("Median Compression ↑", "median_compression"),
("Avg Fertility ↓", "avg_fertility"),
("Median Fertility ↓", "median_fertility"),
("Fertility σ ↓", "std_fertility"),
("Fairness ↑", "fairness"),
("Compression Wins", "wins"),
]
for metric_name, metric_key in metrics_to_show:
print(f" {metric_name:<30}", end="")
for name in tokenizers:
val = overall[name][metric_key]
if metric_key == "wins":
print(f" {int(val):>14}", end="")
else:
print(f" {val:>14.4f}", end="")
print()
# ── Efficiency: compression per vocab token ──
print(f"\n EFFICIENCY (compression per 1K vocab tokens):")
for name in tokenizers:
vocab = len(tokenizers[name])
comp = overall[name]["avg_compression"]
efficiency = comp / (vocab / 1000)
short = name.split("(")[0].strip()
print(f" {short:<20}: {efficiency:.6f} (vocab={vocab:,}, compress={comp:.3f})")
# ── Per-vocab-token analysis ──
print(f"\n NORMALIZED PERFORMANCE (accounting for vocabulary size):")
print(f" (Higher = better use of each vocabulary slot)")
for name in tokenizers:
vocab = len(tokenizers[name])
comp = overall[name]["avg_compression"]
# Compression per log2(vocab) — how efficiently each bit of vocab is used
bits = math.log2(vocab)
comp_per_bit = comp / bits
short = name.split("(")[0].strip()
print(f" {short:<20}: {comp_per_bit:.4f} compression per vocab bit (log₂(vocab)={bits:.1f})")
# ── Save full results ──
with open("/app/deep_benchmark_results.json", "w") as f:
json.dump({
"per_sample": {name: {lang: r for lang, r in res.items()} for name, res in results.items()},
"overall": {name: {k: float(v) for k, v in m.items()} for name, m in overall.items()},
"categories": {cat: list(langs) for cat, langs in categories.items()},
}, f, indent=2, default=str)
print(f"\n ✓ Full results saved to /app/deep_benchmark_results.json")
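# The JSON can be reloaded for downstream analysis, e.g.:
#   with open("/app/deep_benchmark_results.json") as f:
#       data = json.load(f)
#   data["overall"]["Sentinel-SUT (61K)"]["avg_compression"]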
# ── Sentinel-specific features ──
print(f"\n\n {'='*96}")
print(f" SENTINEL-SPECIFIC FEATURES (no other tokenizer has these)")
print(f" {'='*96}")
print(f"\n 1. MULTIMODAL TOKEN ROUTING")
img_start = sentinel.convert_tokens_to_ids("<image_start>")
img_end = sentinel.convert_tokens_to_ids("<image_end>")
aud_start = sentinel.convert_tokens_to_ids("<audio_start>")
vid_start = sentinel.convert_tokens_to_ids("<video_start>")
print(f" <image_start>={img_start}, <image_end>={img_end}")
print(f" <audio_start>={aud_start}, <video_start>={vid_start}")
print(f" Image codebook: <img_0>={sentinel.convert_tokens_to_ids('<img_0>')} to <img_16383>={sentinel.convert_tokens_to_ids('<img_16383>')}")
print(f" Audio codebook: <aud_0>={sentinel.convert_tokens_to_ids('<aud_0>')} to <aud_8191>={sentinel.convert_tokens_to_ids('<aud_8191>')}")
print(f" Video codebook: <vid_0>={sentinel.convert_tokens_to_ids('<vid_0>')} to <vid_4095>={sentinel.convert_tokens_to_ids('<vid_4095>')}")
print(f"\n 2. SENTINEL MANIFOLD TOKENS")
print(f" <sentinel>={sentinel.convert_tokens_to_ids('<sentinel>')}")
print(f" <sentinel_c1>={sentinel.convert_tokens_to_ids('<sentinel_c1>')}")
print(f" <sentinel_c2>={sentinel.convert_tokens_to_ids('<sentinel_c2>')}")
print(f" <scale_1e>={sentinel.convert_tokens_to_ids('<scale_1e>')}")
print(f"\n 3. CHAT FORMAT")
print(f" <system>={sentinel.convert_tokens_to_ids('<system>')}")
print(f" <user>={sentinel.convert_tokens_to_ids('<user>')}")
print(f" <assistant>={sentinel.convert_tokens_to_ids('<assistant>')}")
print(f" <turn>={sentinel.convert_tokens_to_ids('<turn>')}")
print(f"\n 4. CODE/MATH BOUNDARIES")
print(f" <code_start>={sentinel.convert_tokens_to_ids('<code_start>')}")
print(f" <code_end>={sentinel.convert_tokens_to_ids('<code_end>')}")
print(f" <math_start>={sentinel.convert_tokens_to_ids('<math_start>')}")
print(f" <math_end>={sentinel.convert_tokens_to_ids('<math_end>')}")
print(f"\n 5. TASK TOKENS")
print(f" <translate>={sentinel.convert_tokens_to_ids('<translate>')}")
print(f" <summarize>={sentinel.convert_tokens_to_ids('<summarize>')}")
print(f" <generate>={sentinel.convert_tokens_to_ids('<generate>')}")
print(f" <understand>={sentinel.convert_tokens_to_ids('<understand>')}")
print(f" <caption>={sentinel.convert_tokens_to_ids('<caption>')}")
# Multimodal encoding demo
print(f"\n 6. MULTIMODAL ENCODING DEMO")
mm_text = "Describe: <image_start> <img_42> <img_1337> <img_8000> <image_end> A sunset over mountains"
mm_enc = sentinel.encode(mm_text, add_special_tokens=False)
print(f" Input: '{mm_text}'")
print(f" Tokens: {len(mm_enc)}")
print(f" IDs: {mm_enc}")
# Show which modality each token belongs to
print(f" Modality map:")
for tid in mm_enc:
    if tid < 33:
        mod = "SPECIAL"  # reserved special-token range
    elif tid < 32768:
        mod = f"TEXT {sentinel.decode([tid])!r}"  # decode and show the text piece
elif tid < 49152:
mod = f"IMAGE[{tid-32768}]"
elif tid < 57344:
mod = f"AUDIO[{tid-49152}]"
elif tid < 61440:
mod = f"VIDEO[{tid-57344}]"
else:
mod = "UNKNOWN"
print(f" {tid:>6}: {mod}")
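# The ID thresholds used above are an assumed contiguous layout (0-32 special,
# 33-32767 text, then 16,384 image / 8,192 audio / 4,096 video codebook slots),
# not values read from the tokenizer config. A quick cross-check:
IMG_BASE = 32768
print(f"    Layout check: <img_42> expected at ID {IMG_BASE + 42}, "
      f"actual {sentinel.convert_tokens_to_ids('<img_42>')}")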
print(f"\n {'='*96}")
print(f" 🦴 DEEP BENCHMARK COMPLETE")
print(f" {'='*96}")