"""
🦴 Sentinel Universal Tokenizer — Interactive Demo Space
Multimodal tokenizer grounded in the Gradient Axiom: lim F'(z)/F(z) = 1/e
"""
import colorsys
import html
import math

import gradio as gr
from transformers import AutoTokenizer
# ──────────────────────────────────────────────────────────────────────────────
# Constants
# ──────────────────────────────────────────────────────────────────────────────
# 1/e — listed in the demo's constants table as the "Vocab allocation ratio".
INV_E = 1.0 / math.e
# C₁ — listed in the demo's constants table as the "Quantization zero-point".
C1 = -0.007994021805952546
# C₂ — listed in the demo's constants table as the "Fairness bound".
C2 = 0.00020005604296784437
# NOTE(review): none of these three names is referenced in the part of the
# file under review — confirm they are used further down before relying on them.
# ──────────────────────────────────────────────────────────────────────────────
# Load tokenizers
# ──────────────────────────────────────────────────────────────────────────────
# The Sentinel tokenizer is loaded at import time. This call is not wrapped in
# a try/except, so a load failure propagates and stops the module from importing.
print("Loading Sentinel Universal Tokenizer...")
sentinel_tok = AutoTokenizer.from_pretrained("5dimension/sentinel-universal-tokenizer")
print(f" ✓ Sentinel loaded: {len(sentinel_tok):,} tokens")
print("Loading baseline tokenizers...")
# Display label -> loaded tokenizer. Only baselines that loaded successfully
# end up in this dict; the comparison code iterates whatever is present.
baselines = {}
for name, model_id in [
    ("GPT-2 (50K)", "gpt2"),
    ("Gemma (256K)", "google/gemma-2b"),
    ("Qwen2 (152K)", "Qwen/Qwen2-0.5B"),
]:
    try:
        baselines[name] = AutoTokenizer.from_pretrained(model_id)
        print(f" ✓ {name} loaded")
    except Exception as e:
        # Best-effort: a baseline that cannot be loaded is reported on stdout
        # and skipped rather than aborting startup.
        print(f" ⚠ {name}: {e}")
def get_modality_color(token_id):
    """Return the fixed highlight colour for the ID band ``token_id`` falls in.

    Bands are tested in ascending order of their exclusive upper bound and the
    first match wins. The second band (IDs 33..32767) maps to ``None``, which
    tells the caller no fixed colour applies. IDs at or above 61440 get the
    grey fallback.

    Args:
        token_id: integer vocabulary ID.

    Returns:
        A ``"#rrggbb"`` string, or ``None`` for the 33..32767 band.
    """
    # (exclusive upper bound, colour) pairs, smallest bound first.
    bands = (
        (33, "#ff6b6b"),
        (32768, None),
        (49152, "#4ecdc4"),
        (57344, "#45b7d1"),
        (61440, "#96ceb4"),
    )
    for upper_bound, colour in bands:
        if token_id < upper_bound:
            return colour
    return "#95a5a6"
def token_to_color(idx, total):
    """Derive a pastel ``"#rrggbb"`` colour from a token's position.

    The hue advances by the golden-ratio conjugate per index (wrapped into
    [0, 1)), saturation cycles through three levels keyed on ``idx % 3``, and
    the HSV value is fixed at 0.92.

    Args:
        idx: zero-based position of the token in the sequence.
        total: accepted for interface compatibility; not used in the
            computation.

    Returns:
        Lower-case hex colour string with each channel truncated to an int.
    """
    golden_step = 0.618033988749895
    hue = (idx * golden_step) % 1.0
    saturation = 0.35 + 0.15 * (idx % 3)
    channels = colorsys.hsv_to_rgb(hue, saturation, 0.92)
    return "#" + "".join(f"{int(channel * 255):02x}" for channel in channels)
def make_token_html(tokens, token_ids):
    """Render tokens as a run of coloured, HTML-escaped ``<span>`` chips.

    Each chip gets a background colour (the fixed colour from
    ``get_modality_color`` when there is one, otherwise a per-position colour
    from ``token_to_color``) and a tooltip carrying the token ID plus a
    modality label derived from the ID band.

    Args:
        tokens: token strings, parallel to ``token_ids``.
        token_ids: integer vocabulary IDs, one per token.

    Returns:
        A single HTML fragment (the chips concatenated with no separator).
        Empty string when there are no tokens.
    """
    chips = []
    n_tokens = len(tokens)
    for i, (tok, tid) in enumerate(zip(tokens, token_ids)):
        # A falsy result (None) means "no fixed colour": fall back to the
        # position-derived palette.
        color = get_modality_color(tid) or token_to_color(i, n_tokens)

        if tid < 33:
            mod = "special"
        elif tid < 32768:
            mod = "text"
        elif tid < 49152:
            mod = f"img[{tid - 32768}]"
        elif tid < 57344:
            mod = f"aud[{tid - 49152}]"
        elif tid < 61440:
            mod = f"vid[{tid - 57344}]"
        else:
            mod = "?"

        # A missing token string (None) is rendered as a placeholder instead
        # of raising on .replace().
        raw = "" if tok is None else str(tok)
        # Escape &, < and > so token text can never be interpreted as markup
        # by the HTML component, then make whitespace visible.
        display = (
            html.escape(raw, quote=False)
            .replace(" ", "·")
            .replace("\n", "↵")
        )
        if not display.strip():
            display = "·"

        tooltip = html.escape(f"ID: {tid} | {mod}")
        chips.append(
            f'<span title="{tooltip}" style="background:{color};color:#1a1a2e;'
            f'padding:2px 4px;margin:1px;border-radius:4px;display:inline-block;'
            f'font-family:monospace;font-size:13px;">'
            f'{display}</span>'
        )
    return "".join(chips)
def tokenize_and_analyze(text):
    """Tokenize ``text`` with the Sentinel tokenizer and build the UI panels.

    Args:
        text: the user's input string. ``None``, empty, or whitespace-only
            input short-circuits with a prompt message.

    Returns:
        A 5-tuple ``(vis_html, stats_md, compare_md, ids_md, mod_md)``:
        the token visualisation HTML, a stats table, a comparison table
        against every loaded baseline tokenizer, the first 150 token IDs, and
        a per-modality breakdown. For blank input the tuple is the prompt
        message followed by four empty strings.
    """
    if not text or not text.strip():
        return ("Enter some text to tokenize.", "", "", "", "")

    enc = sentinel_tok.encode(text, add_special_tokens=False)
    tokens = sentinel_tok.convert_ids_to_tokens(enc)
    n_tokens = len(enc)
    n_bytes = len(text.encode("utf-8"))
    # max(..., 1) guards the divisions below against a zero denominator.
    n_words = max(len(text.split()), 1)
    compression = n_bytes / max(n_tokens, 1)
    fertility = n_tokens / n_words

    vis_html = (
        '<div style="line-height:2.2;padding:12px;border-radius:8px;'
        'word-break:break-word;">'
        + make_token_html(tokens, enc)
        + "</div>"
    )

    stats_md = "\n".join([
        "### 📈 Stats",
        "| Metric | Value |",
        "|:-------|------:|",
        f"| **Tokens** | **{n_tokens}** |",
        f"| Bytes | {n_bytes} |",
        f"| Words | {n_words} |",
        f"| **Compression** | **{compression:.3f}** bytes/token |",
        f"| Fertility | {fertility:.3f} tokens/word |",
    ]) + "\n"

    rows = [
        f"| **Sentinel-SUT** | **{len(sentinel_tok):,}** | **{n_tokens}** "
        f"| **{compression:.3f}** | **{fertility:.3f}** |"
    ]
    for bname, btok in baselines.items():
        try:
            bn = len(btok.encode(text, add_special_tokens=False))
            bvocab = len(btok)
        except Exception:
            # Best-effort comparison: a baseline that fails on this input gets
            # a dashed row. Catching Exception (not a bare except) lets
            # KeyboardInterrupt / SystemExit propagate.
            rows.append(f"| {bname} | — | — | — | — |")
        else:
            bcomp = n_bytes / max(bn, 1)
            bfert = bn / n_words
            rows.append(f"| {bname} | {bvocab:,} | {bn} | {bcomp:.3f} | {bfert:.3f} |")
    compare_md = (
        "### ⚔️ vs SOTA\n"
        "| Tokenizer | Vocab | Tokens | Compress↑ | Fertility↓ |\n"
        "|:----------|------:|-------:|----------:|-----------:|\n"
        + "\n".join(rows)
    )

    # Only the first 150 IDs are listed to keep the panel readable.
    ids_text = ", ".join(str(x) for x in enc[:150])
    if len(enc) > 150:
        ids_text += f" … +{len(enc) - 150} more"
    ids_md = f"**Token IDs** ({n_tokens}):\n```\n{ids_text}\n```"

    # (exclusive upper bound, modality) pairs, smallest bound first. IDs at or
    # above the last bound are not counted in any bucket.
    bands = (
        (33, "special"),
        (32768, "text"),
        (49152, "image"),
        (57344, "audio"),
        (61440, "video"),
    )
    mod_counts = {label: 0 for _, label in bands}
    for tid in enc:
        for upper_bound, label in bands:
            if tid < upper_bound:
                mod_counts[label] += 1
                break

    emojis = {"special": "⚙️", "text": "📝", "image": "🖼️", "audio": "🔊", "video": "🎬"}
    mod_parts = ["### 🌐 Modality Breakdown\n"]
    for mod, count in mod_counts.items():
        if count > 0:
            pct = count / n_tokens * 100
            # One bar cell per 2%, with a minimum of one cell so small shares
            # are still visible.
            bar = "█" * max(1, int(pct / 2))
            mod_parts.append(
                f"{emojis.get(mod, '')} **{mod}**: {count} ({pct:.1f}%) `{bar}`\n\n"
            )
    mod_md = "".join(mod_parts)

    return vis_html, stats_md, compare_md, ids_md, mod_md
def decode_ids(ids_text):
    """Decode a comma-separated list of token IDs back to text.

    Entries that are not plain (optionally negative) decimal integers are
    skipped rather than failing the whole request.

    Args:
        ids_text: user-supplied string such as ``"72, 4153, 33"``.

    Returns:
        ``(decoded_text, vis_html)``. On any failure the first element is an
        ``"Error: ..."`` message and the second is an empty string, so the UI
        always receives two values.
    """
    try:
        ids = []
        for piece in ids_text.split(","):
            piece = piece.strip()
            if not piece.lstrip("-").isdigit():
                continue
            try:
                ids.append(int(piece))
            except ValueError:
                # str.isdigit() accepts some strings int() rejects (e.g. "--5"
                # after the lstrip, or superscript digits); skip those too.
                continue
        decoded = sentinel_tok.decode(ids, skip_special_tokens=False)
        tokens = sentinel_tok.convert_ids_to_tokens(ids)
        vis_html = (
            '<div style="line-height:2.2;padding:12px;border-radius:8px;'
            'word-break:break-word;">'
            + make_token_html(tokens, ids)
            + "</div>"
        )
        return decoded, vis_html
    except Exception as e:
        # UI boundary: surface the problem in the output box instead of
        # letting the callback raise.
        return f"Error: {e}", ""
def run_multilingual_benchmark():
    """Build a Markdown table of bytes-per-token across fixed sample texts.

    Every sample is encoded with the Sentinel tokenizer and with each loaded
    baseline. A cell holds ``utf8_bytes / token_count`` to two decimals
    (Sentinel cells in bold), or an em dash when that tokenizer fails on the
    sample.

    Returns:
        The table as one Markdown string: header row, alignment row, then one
        row per sample.
    """
    samples = {
        "🇬🇧 English": "Machine learning transforms data into intelligence through gradient optimization.",
        "🇫🇷 French": "L'apprentissage automatique transforme les données en intelligence grâce à l'optimisation.",
        "🇩🇪 German": "Maschinelles Lernen verwandelt Daten in Intelligenz durch mathematische Optimierung.",
        "🇪🇸 Spanish": "El aprendizaje automático transforma datos en inteligencia mediante optimización matemática.",
        "🇨🇳 Chinese": "机器学习通过数学优化将数据转化为智能。深度学习模型使用梯度下降来最小化损失函数。",
        "🇯🇵 Japanese": "機械学習はデータを知性に変換します。深層学習モデルは勾配降下法を使用します。",
        "🇸🇦 Arabic": "التعلم الآلي يحول البيانات إلى ذكاء من خلال التحسين الرياضي للخوارزميات.",
        "🇷🇺 Russian": "Машинное обучение преобразует данные в интеллект посредством математической оптимизации.",
        "🇰🇷 Korean": "머신러닝은 수학적 최적화를 통해 데이터를 지능으로 변환합니다.",
        "🇮🇳 Hindi": "मशीन लर्निंग गणितीय अनुकूलन के माध्यम से डेटा को बुद्धिमत्ता में बदलती है।",
        "🇵🇹 Portuguese": "O aprendizado de máquina transforma dados em inteligência por meio da otimização.",
        "🇻🇳 Vietnamese": "Học máy chuyển đổi dữ liệu thành trí tuệ thông qua tối ưu hóa toán học.",
        "🇹🇭 Thai": "การเรียนรู้ของเครื่องเปลี่ยนข้อมูลเป็นปัญญาผ่านการเพิ่มประสิทธิภาพทางคณิตศาสตร์",
        "🐍 Python": "def sech(x): return 1.0 / math.cosh(x * (1/math.e))",
        "📐 Math": "∫₀¹ x⁻ˣ dx = Σ n⁻ⁿ ≈ 1.291, ∇f = (∂f/∂x₁, ∂f/∂x₂)",
    }
    # Sentinel first, then every baseline that loaded successfully.
    all_toks = {"**Sentinel**": sentinel_tok, **baselines}

    # Column titles: strip bold markers and the "(size)" suffix, cap at 10 chars.
    short_names = [
        tname.replace("**", "").split("(")[0].strip()[:10] for tname in all_toks
    ]
    header = "| Language | Text |" + "".join(f" {short} |" for short in short_names)
    sep = "|:---------|:-----|" + " ---: |" * len(all_toks)

    rows = [header, sep]
    for lang, text in samples.items():
        n_bytes = len(text.encode("utf-8"))
        cells = [f"| {lang} | {text[:35]}… |"]
        for tname, tok in all_toks.items():
            try:
                enc = tok.encode(text, add_special_tokens=False)
            except Exception:
                # Best-effort: one failing tokenizer yields a dash for this
                # cell. Catching Exception (not a bare except) lets
                # KeyboardInterrupt / SystemExit propagate.
                cells.append(" — |")
                continue
            comp = n_bytes / max(len(enc), 1)
            cells.append(f" **{comp:.2f}** |" if "Sentinel" in tname else f" {comp:.2f} |")
        rows.append("".join(cells))
    return "\n".join(rows)
# Sample inputs for the Tokenize tab: the first entry is the textbox's initial
# value and the whole list is offered through gr.Examples.
EXAMPLES = [
    "The Sentinel Manifold: F(z) = Σ zⁿ/nⁿ, where lim F'(z)/F(z) = 1/e ≈ 0.3679. This unified mathematical framework powers optimization, attention, quantization, and generation.",
    "机器学习通过数学优化将数据转化为智能。Sentinel多样体提供了统一的数学框架。",
    "def sentinel_attention(Q, K, V):\n scores = Q @ K.T / math.sqrt(d)\n attn = 1.0 / torch.cosh(scores)\n return (attn / attn.sum(-1, keepdim=True)) @ V",
    # NOTE(review): this sample and the next read as if they were meant to
    # contain multimodal / chat marker tokens that are no longer present —
    # confirm against the intended prompts.
    "Describe: Listen: ",
    "You are a multimodal AI.What is 1/e?1/e ≈ 0.3679, the Gradient Axiom limit.",
    "∫₀¹ x⁻ˣ dx = Σ n⁻ⁿ ≈ 1.29128, ∇·E = ρ/ε₀, det(A−λI) = 0",
    "الثعلب البني السريع يقفز فوق الكلب الكسول. التعلم الآلي يحول البيانات إلى ذكاء.",
    "🦴🧠🔬💡🚀 Sentinel uses sech(x) = 1/cosh(x) for bounded gradients 📈 across modalities 🖼️🔊🎬",
]
with gr.Blocks(
title="🦴 Sentinel Universal Tokenizer",
theme=gr.themes.Soft(primary_hue="indigo", secondary_hue="purple"),
) as demo:
gr.Markdown("""
# 🦴 Sentinel Universal Tokenizer
**One theorem. Every modality. One vocabulary.**
A **61,440-token** multimodal tokenizer for **text + image + audio + video**,
grounded in the Gradient Axiom: `lim F'(z)/F(z) = 1/e`
| Constant | Value | Role |
|:---------|:------|:-----|
| **1/e** | 0.36788 | Vocab allocation ratio |
| **C₁** | −0.00799 | Quantization zero-point |
| **C₂** | 0.00020 | Fairness bound |
""")
with gr.Tabs():
with gr.Tab("🔤 Tokenize"):
with gr.Row():
with gr.Column(scale=2):
text_input = gr.Textbox(label="Input Text", lines=5, value=EXAMPLES[0], placeholder="Enter text in any language, code, math, or multimodal format...")
tokenize_btn = gr.Button("🦴 Tokenize", variant="primary", size="lg")
gr.Examples(examples=EXAMPLES, inputs=text_input, label="Try these:")
with gr.Column(scale=3):
token_vis = gr.HTML(label="Token Visualization")
with gr.Row():
stats_out = gr.Markdown()
compare_out = gr.Markdown()
with gr.Row():
ids_out = gr.Markdown()
mod_out = gr.Markdown()
tokenize_btn.click(tokenize_and_analyze, inputs=text_input, outputs=[token_vis, stats_out, compare_out, ids_out, mod_out])
with gr.Tab("🔙 Decode"):
gr.Markdown("### Decode Token IDs → Text")
ids_input = gr.Textbox(label="Token IDs (comma-separated)", placeholder="72, 4153, 33, 3004, 592", lines=2)
decode_btn = gr.Button("Decode", variant="primary")
decoded_text = gr.Textbox(label="Decoded Text", lines=3)
decoded_vis = gr.HTML(label="Tokens")
decode_btn.click(decode_ids, inputs=ids_input, outputs=[decoded_text, decoded_vis])
with gr.Tab("📊 Benchmark"):
gr.Markdown("### Multilingual Compression Benchmark\nCompression ratio (bytes/token). **Higher = better.**")
bench_btn = gr.Button("Run Benchmark", variant="primary")
bench_out = gr.Markdown()
bench_btn.click(run_multilingual_benchmark, outputs=bench_out)
with gr.Tab("📖 Vocabulary"):
gr.Markdown(f"""### Architecture
```
┌──────────────────────────────────────────────────┐
│ SENTINEL UNIVERSAL TOKENIZER (61,440 tokens) │
│ │
│ [0–32] → 33 Special/Control tokens │
│ [33–32,767] → 32,735 ByteLevel BPE (text) │
│ [32,768–49,151] → 16,384 Image codebook (VQ) │
│ [49,152–57,343] → 8,192 Audio codebook (VQ) │
│ [57,344–61,439] → 4,096 Video codebook (VQ) │
│ │
│ Follows 1/e Gradient Axiom scaling │
└──────────────────────────────────────────────────┘
```
**Total**: {len(sentinel_tok):,} tokens | **Text**: 32K | **Image**: 16K | **Audio**: 8K | **Video**: 4K
""")
specials_md = "### Special Tokens\n| Token | ID | Purpose |\n|:------|---:|:--------|\n"
for tok_name, purpose in [
("","Padding"), ("","Unknown"), ("","BOS"), ("","EOS"), ("","MLM"),
("","Image start"), ("","Image end"), ("","Image placeholder"),
("","Audio start"), ("","Audio end"), ("