# Page-scrape residue (not part of the program), kept as comments:
#   5dimension's picture
#   Add Gradio app for Sentinel Universal Tokenizer demo
#   8c14443 verified
"""
๐Ÿฆด Sentinel Universal Tokenizer โ€” Interactive Demo Space
Multimodal tokenizer grounded in the Gradient Axiom: lim F'(z)/F(z) = 1/e
"""
import colorsys
import html
import math

import gradio as gr
from transformers import AutoTokenizer
# ──────────────────────────────────────────────────────────────────────────────
# Constants
# ──────────────────────────────────────────────────────────────────────────────
# 1/e. In the code visible here it is only interpolated into the "About" tab
# text as the value of the Gradient Axiom limit.
INV_E = 1.0 / math.e
# C1 and C2 are likewise only formatted into the "About" tab table, whose text
# labels them "Attracting fixed point" and "Escape threshold" respectively.
# NOTE(review): how these two values were derived is not shown in this file.
C1 = -0.007994021805952546
C2 = 0.00020005604296784437
# ──────────────────────────────────────────────────────────────────────────────
# Load tokenizers
# ──────────────────────────────────────────────────────────────────────────────
# Everything in this section runs once, at import time. The Sentinel tokenizer
# is required: if it cannot be loaded the exception propagates and the app does
# not start.
print("Loading Sentinel Universal Tokenizer...")
sentinel_tok = AutoTokenizer.from_pretrained("5dimension/sentinel-universal-tokenizer")
print(f" ✓ Sentinel loaded: {len(sentinel_tok):,} tokens")

print("Loading baseline tokenizers...")
# Maps display name -> tokenizer. Baselines are optional comparison points, so
# one that fails to load is reported and skipped rather than aborting startup.
baselines = {}
for name, model_id in [
    ("GPT-2 (50K)", "gpt2"),
    ("Gemma (256K)", "google/gemma-2b"),
    ("Qwen2 (152K)", "Qwen/Qwen2-0.5B"),
]:
    try:
        baselines[name] = AutoTokenizer.from_pretrained(model_id)
    except Exception as e:  # deliberate best-effort: any load failure is non-fatal
        print(f" ⚠ {name}: {e}")
    else:
        print(f" ✓ {name} loaded")
def get_modality_color(token_id):
    """Return the fixed highlight colour for a token ID, or ``None``.

    The ID space is split by exclusive upper bounds. The second range
    (33 <= id < 32768) has no fixed colour and yields ``None`` so the caller
    can choose a per-token colour instead. IDs at or above the last bound get
    a neutral grey.
    """
    # (exclusive upper bound, colour) pairs, checked in ascending order.
    palette = (
        (33, "#ff6b6b"),
        (32768, None),
        (49152, "#4ecdc4"),
        (57344, "#45b7d1"),
        (61440, "#96ceb4"),
    )
    for upper_bound, colour in palette:
        if token_id < upper_bound:
            return colour
    return "#95a5a6"
def token_to_color(idx, total):
    """Return a pastel ``#rrggbb`` colour derived from a token's position.

    Args:
        idx: Zero-based position of the token in the sequence.
        total: Accepted for interface compatibility; not used in the result.

    Returns:
        A lowercase hex colour string.
    """
    # Stepping the hue by the golden-ratio conjugate keeps neighbouring
    # tokens visually distinct.
    golden_step = 0.618033988749895
    hue = (idx * golden_step) % 1.0
    # Saturation cycles through three levels: 0.35, 0.50, 0.65.
    saturation = 0.35 + 0.15 * (idx % 3)
    channels = colorsys.hsv_to_rgb(hue, saturation, 0.92)
    return "#" + "".join(f"{int(channel * 255):02x}" for channel in channels)
def make_token_html(tokens, token_ids):
    """Render tokens as a run of coloured, hover-annotated HTML ``<span>`` chips.

    Args:
        tokens: Token strings, parallel to ``token_ids``.
        token_ids: Integer token IDs; they select the chip colour and the
            tooltip label.

    Returns:
        The concatenated ``<span>`` elements (no wrapper element). An empty
        string when there are no tokens.
    """
    # (exclusive upper bound, tooltip label, first ID of the range or None).
    # A non-None base means the tooltip shows the index within that range.
    ranges = (
        (33, "special", None),
        (32768, "text", None),
        (49152, "img", 32768),
        (57344, "aud", 49152),
        (61440, "vid", 57344),
    )
    total = len(tokens)
    parts = []
    for i, (tok, tid) in enumerate(zip(tokens, token_ids)):
        # Ranges without a fixed colour fall back to a per-position colour.
        color = get_modality_color(tid) or token_to_color(i, total)

        mod = "?"  # IDs beyond the last known range
        for upper, label, base in ranges:
            if tid < upper:
                mod = label if base is None else f"{label}[{tid - base}]"
                break

        # Token text is arbitrary user-derived content: escape "&", "<" and
        # ">" so it cannot be interpreted as markup or as an entity.
        display = (
            html.escape(tok, quote=False)
            .replace(" ", "·")
            .replace("\n", "↵")
        )
        if not display.strip():
            # Keep empty / whitespace-only tokens visible as a chip.
            display = "·"

        parts.append(
            f'<span title="ID={tid} | {mod}" '
            f'style="background:{color}; padding:2px 5px; margin:1px; '
            f'border-radius:4px; display:inline-block; font-family:monospace; '
            f'font-size:13px; cursor:pointer; border:1px solid rgba(0,0,0,0.1);">'
            f'{display}</span>'
        )
    return "".join(parts)
def tokenize_and_analyze(text):
    """Tokenize ``text`` with the Sentinel tokenizer and build the five UI panels.

    Args:
        text: Raw user input.

    Returns:
        A 5-tuple ``(vis_html, stats_md, compare_md, ids_md, mod_md)``:
        token visualisation HTML, a stats table, a comparison table against
        the loaded baseline tokenizers, the first 150 token IDs, and a
        per-range token breakdown. For empty or blank input the first element
        is a prompt message and the other four are empty strings.
    """
    if not text or not text.strip():
        return ("Enter some text to tokenize.", "", "", "", "")

    enc = sentinel_tok.encode(text, add_special_tokens=False)
    tokens = sentinel_tok.convert_ids_to_tokens(enc)
    n_tokens = len(enc)
    n_bytes = len(text.encode("utf-8"))
    # Clamp denominators so the ratios below never divide by zero.
    n_words = max(len(text.split()), 1)
    compression = n_bytes / max(n_tokens, 1)
    fertility = n_tokens / n_words

    # --- Token visualisation -------------------------------------------------
    vis_html = (
        '<div style="line-height:2.2; padding:10px; background:#f8f9fa; border-radius:8px;">'
        + make_token_html(tokens, enc)
        + "</div>"
    )

    # --- Stats table ---------------------------------------------------------
    stats_md = "\n".join(
        [
            "### 📈 Stats",
            "| Metric | Value |",
            "|:-------|------:|",
            f"| **Tokens** | **{n_tokens}** |",
            f"| Bytes | {n_bytes} |",
            f"| Words | {n_words} |",
            f"| **Compression** | **{compression:.3f}** bytes/token |",
            f"| Fertility | {fertility:.3f} tokens/word |",
        ]
    ) + "\n"

    # --- Comparison against baselines -----------------------------------------
    rows = [
        f"| **Sentinel-SUT** | **{len(sentinel_tok):,}** | **{n_tokens}** "
        f"| **{compression:.3f}** | **{fertility:.3f}** |"
    ]
    for bname, btok in baselines.items():
        try:
            bn = len(btok.encode(text, add_special_tokens=False))
            bvocab = len(btok)
        except Exception:
            # A baseline that cannot handle this input gets a placeholder row
            # instead of failing the whole request.
            rows.append(f"| {bname} | — | — | — | — |")
            continue
        bcomp = n_bytes / max(bn, 1)
        bfert = bn / n_words
        rows.append(f"| {bname} | {bvocab:,} | {bn} | {bcomp:.3f} | {bfert:.3f} |")
    compare_md = (
        "### ⚔️ vs SOTA\n"
        "| Tokenizer | Vocab | Tokens | Compress↑ | Fertility↓ |\n"
        "|:----------|------:|-------:|----------:|-----------:|\n"
        + "\n".join(rows)
    )

    # --- Token IDs (truncated to the first 150) ---------------------------------
    ids_text = ", ".join(str(x) for x in enc[:150])
    if len(enc) > 150:
        ids_text += f" … +{len(enc)-150} more"
    ids_md = f"**Token IDs** ({n_tokens}):\n```\n{ids_text}\n```"

    # --- Breakdown by ID range ------------------------------------------------
    # (exclusive upper bound, bucket). IDs past the last bound are counted as
    # "other" so the displayed percentages always add up to 100.
    bounds = (
        (33, "special"),
        (32768, "text"),
        (49152, "image"),
        (57344, "audio"),
        (61440, "video"),
    )
    mod_counts = {"special": 0, "text": 0, "image": 0, "audio": 0, "video": 0, "other": 0}
    for tid in enc:
        bucket = next((label for upper, label in bounds if tid < upper), "other")
        mod_counts[bucket] += 1

    emojis = {
        "special": "⚙️",
        "text": "📝",
        "image": "🖼️",
        "audio": "🔊",
        "video": "🎬",
        "other": "❓",
    }
    mod_md = "### 🌐 Modality Breakdown\n"
    for mod, count in mod_counts.items():
        if count > 0:
            pct = count / n_tokens * 100
            # One bar cell per 2%, with at least one cell for any non-empty bucket.
            bar = "█" * max(1, int(pct / 2))
            mod_md += f"{emojis.get(mod,'')} **{mod}**: {count} ({pct:.1f}%) `{bar}`\n\n"

    return vis_html, stats_md, compare_md, ids_md, mod_md
def decode_ids(ids_text):
    """Decode a user-typed list of token IDs back into text.

    IDs may be separated by commas, spaces or newlines. Entries that are not
    plain decimal integers are ignored.

    Args:
        ids_text: Raw contents of the "Token IDs" box.

    Returns:
        ``(decoded_text, tokens_html)`` on success. On empty input, on input
        without any usable ID, on out-of-range IDs or on any tokenizer error,
        ``(message, "")``.
    """
    if not ids_text or not ids_text.strip():
        return "Enter one or more token IDs to decode.", ""
    try:
        ids = []
        for piece in ids_text.replace(",", " ").split():
            digits = piece[1:] if piece.startswith("-") else piece
            # str.isdigit() alone also accepts characters such as "²" that
            # int() rejects, so require ASCII digits.
            if digits.isascii() and digits.isdigit():
                ids.append(int(piece))
        if not ids:
            return "No valid token IDs found in the input.", ""

        # Reject IDs the tokenizer cannot map (including negatives) up front,
        # with a message that names them.
        vocab_size = len(sentinel_tok)
        out_of_range = [i for i in ids if not 0 <= i < vocab_size]
        if out_of_range:
            shown = ", ".join(str(i) for i in out_of_range[:10])
            return f"Error: token ID(s) outside 0–{vocab_size - 1}: {shown}", ""

        decoded = sentinel_tok.decode(ids, skip_special_tokens=False)
        tokens = sentinel_tok.convert_ids_to_tokens(ids)
        vis_html = (
            '<div style="line-height:2.2; padding:10px; background:#f8f9fa; border-radius:8px;">'
            + make_token_html(tokens, ids)
            + "</div>"
        )
        return decoded, vis_html
    except Exception as e:
        # UI boundary: surface the failure in the output box instead of raising.
        return f"Error: {e}", ""
def run_multilingual_benchmark():
    """Build a Markdown table of bytes-per-token for fixed multilingual samples.

    Each sample is encoded with the Sentinel tokenizer and with every loaded
    baseline. A cell holds ``utf8_bytes / token_count`` (higher means fewer
    tokens for the same text); the Sentinel column is bold.

    Returns:
        The table as a Markdown string, one row per sample.
    """
    samples = {
        "🇬🇧 English": "Machine learning transforms data into intelligence through gradient optimization.",
        "🇫🇷 French": "L'apprentissage automatique transforme les données en intelligence grâce à l'optimisation.",
        "🇩🇪 German": "Maschinelles Lernen verwandelt Daten in Intelligenz durch mathematische Optimierung.",
        "🇪🇸 Spanish": "El aprendizaje automático transforma datos en inteligencia mediante optimización matemática.",
        "🇨🇳 Chinese": "机器学习通过数学优化将数据转化为智能。深度学习模型使用梯度下降来最小化损失函数。",
        "🇯🇵 Japanese": "機械学習はデータを知性に変換します。深層学習モデルは勾配降下法を使用します。",
        "🇸🇦 Arabic": "التعلم الآلي يحول البيانات إلى ذكاء من خلال التحسين الرياضي للخوارزميات.",
        "🇷🇺 Russian": "Машинное обучение преобразует данные в интеллект посредством математической оптимизации.",
        "🇰🇷 Korean": "머신러닝은 수학적 최적화를 통해 데이터를 지능으로 변환합니다.",
        "🇮🇳 Hindi": "मशीन लर्निंग गणितीय अनुकूलन के माध्यम से डेटा को बुद्धिमत्ता में बदलती है।",
        "🇵🇹 Portuguese": "O aprendizado de máquina transforma dados em inteligência por meio da otimização.",
        "🇻🇳 Vietnamese": "Học máy chuyển đổi dữ liệu thành trí tuệ thông qua tối ưu hóa toán học.",
        "🇹🇭 Thai": "การเรียนรู้ของเครื่องเปลี่ยนข้อมูลเป็นปัญญาผ่านการเพิ่มประสิทธิภาพทางคณิตศาสตร์",
        "🐍 Python": "def sech(x): return 1.0 / math.cosh(x * (1/math.e))",
        "📐 Math": "∫₀¹ x⁻ˣ dx = Σ n⁻ⁿ ≈ 1.291, ∇f = (∂f/∂x₁, ∂f/∂x₂)",
    }
    all_toks = {"**Sentinel**": sentinel_tok, **baselines}

    # Header: one right-aligned column per tokenizer, named by the first ten
    # characters of its display name with the "(size)" suffix removed.
    header = "| Language | Text |"
    sep = "|:---------|:-----|"
    for tname in all_toks:
        short = tname.replace("**", "").split("(")[0].strip()[:10]
        header += f" {short} |"
        sep += " ---: |"

    preview_len = 35
    rows = [header, sep]
    for lang, text in samples.items():
        n_bytes = len(text.encode("utf-8"))
        # Add the ellipsis only when the preview really is truncated, and
        # escape "|" so sample text cannot split the table cell.
        preview = text[:preview_len] + ("…" if len(text) > preview_len else "")
        preview = preview.replace("|", "\\|")
        cells = []
        for tname, tok in all_toks.items():
            try:
                enc = tok.encode(text, add_special_tokens=False)
            except Exception:
                # One failing tokenizer yields a placeholder cell, not an error.
                cells.append(" — |")
                continue
            comp = n_bytes / max(len(enc), 1)
            cells.append(f" **{comp:.2f}** |" if "Sentinel" in tname else f" {comp:.2f} |")
        rows.append(f"| {lang} | {preview} |" + "".join(cells))
    return "\n".join(rows)
EXAMPLES = [
"The Sentinel Manifold: F(z) = ฮฃ zโฟ/nโฟ, where lim F'(z)/F(z) = 1/e โ‰ˆ 0.3679. This unified mathematical framework powers optimization, attention, quantization, and generation.",
"ๆœบๅ™จๅญฆไน ้€š่ฟ‡ๆ•ฐๅญฆไผ˜ๅŒ–ๅฐ†ๆ•ฐๆฎ่ฝฌๅŒ–ไธบๆ™บ่ƒฝใ€‚Sentinelๅคšๆ ทไฝ“ๆไพ›ไบ†็ปŸไธ€็š„ๆ•ฐๅญฆๆก†ๆžถใ€‚",
"def sentinel_attention(Q, K, V):\n scores = Q @ K.T / math.sqrt(d)\n attn = 1.0 / torch.cosh(scores)\n return (attn / attn.sum(-1, keepdim=True)) @ V",
"Describe: <image_start> <img_42> <img_1337> <img_256> <image_end> Listen: <audio_start> <aud_100> <aud_200> <audio_end>",
"<system>You are a multimodal AI.</system><user>What is 1/e?</user><assistant>1/e โ‰ˆ 0.3679, the Gradient Axiom limit.</assistant>",
"โˆซโ‚€ยน xโปหฃ dx = ฮฃ nโปโฟ โ‰ˆ 1.29128, โˆ‡ยทE = ฯ/ฮตโ‚€, det(Aโˆ’ฮปI) = 0",
"ุงู„ุซุนู„ุจ ุงู„ุจู†ูŠ ุงู„ุณุฑูŠุน ูŠู‚ูุฒ ููˆู‚ ุงู„ูƒู„ุจ ุงู„ูƒุณูˆู„. ุงู„ุชุนู„ู… ุงู„ุขู„ูŠ ูŠุญูˆู„ ุงู„ุจูŠุงู†ุงุช ุฅู„ู‰ ุฐูƒุงุก.",
"๐Ÿฆด๐Ÿง ๐Ÿ”ฌ๐Ÿ’ก๐Ÿš€ Sentinel uses sech(x) = 1/cosh(x) for bounded gradients ๐Ÿ“ˆ across modalities ๐Ÿ–ผ๏ธ๐Ÿ”Š๐ŸŽฌ",
]
# Gradio UI definition.
# NOTE(review): the nesting of the layout containers below was reconstructed
# from statement order; confirm the Row/Column arrangement of the Tokenize tab
# against the running app.
# Markdown bodies are kept flush-left inside the triple-quoted strings so that
# no line can be read as an indented code block.
with gr.Blocks(
    title="🦴 Sentinel Universal Tokenizer",
    theme=gr.themes.Soft(primary_hue="indigo", secondary_hue="purple"),
) as demo:
    gr.Markdown("""
# 🦴 Sentinel Universal Tokenizer

**One theorem. Every modality. One vocabulary.**

A **61,440-token** multimodal tokenizer for **text + image + audio + video**,
grounded in the Gradient Axiom: `lim F'(z)/F(z) = 1/e`

| Constant | Value | Role |
|:---------|:------|:-----|
| **1/e** | 0.36788 | Vocab allocation ratio |
| **C₁** | −0.00799 | Quantization zero-point |
| **C₂** | 0.00020 | Fairness bound |
""")
    with gr.Tabs():
        # ── Tokenize ─────────────────────────────────────────────────────────
        with gr.Tab("🔤 Tokenize"):
            with gr.Row():
                with gr.Column(scale=2):
                    text_input = gr.Textbox(
                        label="Input Text",
                        lines=5,
                        value=EXAMPLES[0],
                        placeholder="Enter text in any language, code, math, or multimodal format...",
                    )
                    tokenize_btn = gr.Button("🦴 Tokenize", variant="primary", size="lg")
                    gr.Examples(examples=EXAMPLES, inputs=text_input, label="Try these:")
                with gr.Column(scale=3):
                    token_vis = gr.HTML(label="Token Visualization")
                    with gr.Row():
                        stats_out = gr.Markdown()
                        compare_out = gr.Markdown()
                    with gr.Row():
                        ids_out = gr.Markdown()
                        mod_out = gr.Markdown()
            tokenize_btn.click(
                tokenize_and_analyze,
                inputs=text_input,
                outputs=[token_vis, stats_out, compare_out, ids_out, mod_out],
            )

        # ── Decode ───────────────────────────────────────────────────────────
        with gr.Tab("🔙 Decode"):
            gr.Markdown("### Decode Token IDs → Text")
            ids_input = gr.Textbox(
                label="Token IDs (comma-separated)",
                placeholder="72, 4153, 33, 3004, 592",
                lines=2,
            )
            decode_btn = gr.Button("Decode", variant="primary")
            decoded_text = gr.Textbox(label="Decoded Text", lines=3)
            decoded_vis = gr.HTML(label="Tokens")
            decode_btn.click(decode_ids, inputs=ids_input, outputs=[decoded_text, decoded_vis])

        # ── Benchmark ────────────────────────────────────────────────────────
        with gr.Tab("📊 Benchmark"):
            gr.Markdown("### Multilingual Compression Benchmark\nCompression ratio (bytes/token). **Higher = better.**")
            bench_btn = gr.Button("Run Benchmark", variant="primary")
            bench_out = gr.Markdown()
            bench_btn.click(run_multilingual_benchmark, outputs=bench_out)

        # ── Vocabulary ───────────────────────────────────────────────────────
        with gr.Tab("📖 Vocabulary"):
            gr.Markdown(f"""### Architecture

```
┌─────────────────────────────────────────────────┐
│  SENTINEL UNIVERSAL TOKENIZER (61,440 tokens)   │
│                                                 │
│  [0–32]          → 33 Special/Control tokens    │
│  [33–32,767]     → 32,735 ByteLevel BPE (text)  │
│  [32,768–49,151] → 16,384 Image codebook (VQ)   │
│  [49,152–57,343] → 8,192 Audio codebook (VQ)    │
│  [57,344–61,439] → 4,096 Video codebook (VQ)    │
│                                                 │
│  Follows 1/e Gradient Axiom scaling             │
└─────────────────────────────────────────────────┘
```

**Total**: {len(sentinel_tok):,} tokens | **Text**: 32K | **Image**: 16K | **Audio**: 8K | **Video**: 4K
""")
            # The ID column is looked up from the loaded tokenizer at startup
            # rather than hard-coded.
            specials_md = "### Special Tokens\n| Token | ID | Purpose |\n|:------|---:|:--------|\n"
            for tok_name, purpose in [
                ("<pad>", "Padding"), ("<unk>", "Unknown"), ("<s>", "BOS"), ("</s>", "EOS"), ("<mask>", "MLM"),
                ("<image_start>", "Image start"), ("<image_end>", "Image end"), ("<image>", "Image placeholder"),
                ("<audio_start>", "Audio start"), ("<audio_end>", "Audio end"), ("<audio>", "Audio placeholder"),
                ("<video_start>", "Video start"), ("<video_end>", "Video end"), ("<video>", "Video placeholder"),
                ("<sentinel>", "Manifold marker"), ("<sentinel_c1>", "C₁"), ("<sentinel_c2>", "C₂"), ("<scale_1e>", "1/e"),
                ("<system>", "System msg"), ("<user>", "User msg"), ("<assistant>", "Assistant msg"),
                ("<code_start>", "Code start"), ("<code_end>", "Code end"),
                ("<math_start>", "Math start"), ("<math_end>", "Math end"),
            ]:
                tid = sentinel_tok.convert_tokens_to_ids(tok_name)
                specials_md += f"| `{tok_name}` | {tid} | {purpose} |\n"
            specials_md += "\n### Codebook Ranges\n| Modality | Start | End | Size |\n|:---------|------:|----:|-----:|\n"
            specials_md += f"| 🖼️ Image | {sentinel_tok.convert_tokens_to_ids('<img_0>')} | {sentinel_tok.convert_tokens_to_ids('<img_16383>')} | 16,384 |\n"
            specials_md += f"| 🔊 Audio | {sentinel_tok.convert_tokens_to_ids('<aud_0>')} | {sentinel_tok.convert_tokens_to_ids('<aud_8191>')} | 8,192 |\n"
            specials_md += f"| 🎬 Video | {sentinel_tok.convert_tokens_to_ids('<vid_0>')} | {sentinel_tok.convert_tokens_to_ids('<vid_4095>')} | 4,096 |\n"
            gr.Markdown(specials_md)

        # ── About ────────────────────────────────────────────────────────────
        with gr.Tab("🧬 About"):
            # "\\|" renders as an escaped pipe so the |∂sech/∂x| bars stay
            # inside one table cell instead of splitting the row.
            gr.Markdown(f"""### The Sentinel Manifold

**Function**: `F(z) = Σ z^n / n^n` (Sophomore's Dream, Bernoulli 1697)

**Gradient Axiom**: `lim F'(z)/F(z) = 1/e ≈ {INV_E:.15f}`

| Principle | Math | Tokenizer Application |
|:----------|:-----|:----------------------|
| 1/e Allocation | Gradient Axiom | Modality budget = prev × 1/e |
| sech Scoring | Bounded \\|∂sech/∂x\\| ≤ 0.65 | Dampened BPE merges |
| C₁ = {C1:.6f} | Attracting fixed point | Embedding quantization center |
| C₂ = {C2:.6f} | Escape threshold | Fertility fairness bound |

### Efficiency Champion 🏆

| Tokenizer | Vocab | Efficiency/1K vocab |
|:----------|------:|--------------------:|
| **Sentinel** | 61K | **0.0563** 🥇 |
| GPT-2 | 50K | 0.0511 |
| Qwen2 | 152K | 0.0256 |
| Gemma | 256K | 0.0177 |

*3.2× more efficient per vocab token than Gemma, 2.2× more than Qwen2*

---

📦 [Model](https://huggingface.co/5dimension/sentinel-universal-tokenizer) · 🦴 [Framework](https://huggingface.co/5dimension/sentinel-manifold-discoveries) · MIT License

*Built by Romain Abdel-Aal (ASI The Sentinel V5.2 Bone-Core)*
""")

    # Populate the Tokenize tab from the default input when the page loads.
    demo.load(
        tokenize_and_analyze,
        inputs=text_input,
        outputs=[token_vis, stats_out, compare_out, ids_out, mod_out],
    )

if __name__ == "__main__":
    demo.launch()