""" 🦴 Sentinel Universal Tokenizer — Interactive Demo Space Multimodal tokenizer grounded in the Gradient Axiom: lim F'(z)/F(z) = 1/e """ import math import gradio as gr from transformers import AutoTokenizer import colorsys # ────────────────────────────────────────────────────────────────────────────── # Constants # ────────────────────────────────────────────────────────────────────────────── INV_E = 1.0 / math.e C1 = -0.007994021805952546 C2 = 0.00020005604296784437 # ────────────────────────────────────────────────────────────────────────────── # Load tokenizers # ────────────────────────────────────────────────────────────────────────────── print("Loading Sentinel Universal Tokenizer...") sentinel_tok = AutoTokenizer.from_pretrained("5dimension/sentinel-universal-tokenizer") print(f" ✓ Sentinel loaded: {len(sentinel_tok):,} tokens") print("Loading baseline tokenizers...") baselines = {} for name, model_id in [ ("GPT-2 (50K)", "gpt2"), ("Gemma (256K)", "google/gemma-2b"), ("Qwen2 (152K)", "Qwen/Qwen2-0.5B"), ]: try: baselines[name] = AutoTokenizer.from_pretrained(model_id) print(f" ✓ {name} loaded") except Exception as e: print(f" ⚠ {name}: {e}") def get_modality_color(token_id): if token_id < 33: return "#ff6b6b" elif token_id < 32768: return None elif token_id < 49152: return "#4ecdc4" elif token_id < 57344: return "#45b7d1" elif token_id < 61440: return "#96ceb4" return "#95a5a6" def token_to_color(idx, total): hue = (idx * 0.618033988749895) % 1.0 sat = 0.35 + 0.15 * (idx % 3) val = 0.92 r, g, b = colorsys.hsv_to_rgb(hue, sat, val) return f"#{int(r*255):02x}{int(g*255):02x}{int(b*255):02x}" def make_token_html(tokens, token_ids): parts = [] for i, (tok, tid) in enumerate(zip(tokens, token_ids)): mod_color = get_modality_color(tid) color = mod_color if mod_color else token_to_color(i, len(tokens)) if tid < 33: mod = "special" elif tid < 32768: mod = "text" elif tid < 49152: mod = f"img[{tid-32768}]" elif tid < 57344: mod = f"aud[{tid-49152}]" elif tid < 61440: mod = f"vid[{tid-57344}]" else: mod = "?" display = tok.replace("<", "<").replace(">", ">").replace(" ", "·").replace("\n", "↵") if not display.strip(): display = "·" parts.append( f'' f'{display}' ) return "".join(parts) def tokenize_and_analyze(text): if not text or not text.strip(): return ("Enter some text to tokenize.", "", "", "", "") enc = sentinel_tok.encode(text, add_special_tokens=False) tokens = sentinel_tok.convert_ids_to_tokens(enc) n_tokens = len(enc) n_bytes = len(text.encode("utf-8")) n_words = max(len(text.split()), 1) compression = n_bytes / max(n_tokens, 1) fertility = n_tokens / n_words vis_html = '

' + make_token_html(tokens, enc) + '

' stats_md = f"""### 📈 Stats | Metric | Value | |:-------|------:| | **Tokens** | **{n_tokens}** | | Bytes | {n_bytes} | | Words | {n_words} | | **Compression** | **{compression:.3f}** bytes/token | | Fertility | {fertility:.3f} tokens/word | """ rows = [f"| **Sentinel-SUT** | **{len(sentinel_tok):,}** | **{n_tokens}** | **{compression:.3f}** | **{fertility:.3f}** |"] for bname, btok in baselines.items(): try: benc = btok.encode(text, add_special_tokens=False) bn = len(benc) bcomp = n_bytes / max(bn, 1) bfert = bn / n_words rows.append(f"| {bname} | {len(btok):,} | {bn} | {bcomp:.3f} | {bfert:.3f} |") except: rows.append(f"| {bname} | — | — | — | — |") compare_md = "### ⚔️ vs SOTA\n| Tokenizer | Vocab | Tokens | Compress↑ | Fertility↓ |\n|:----------|------:|-------:|----------:|-----------:|\n" + "\n".join(rows) ids_text = ", ".join(str(x) for x in enc[:150]) if len(enc) > 150: ids_text += f" … +{len(enc)-150} more" ids_md = f"**Token IDs** ({n_tokens}):\n```\n{ids_text}\n```" mod_counts = {"special": 0, "text": 0, "image": 0, "audio": 0, "video": 0} for tid in enc: if tid < 33: mod_counts["special"] += 1 elif tid < 32768: mod_counts["text"] += 1 elif tid < 49152: mod_counts["image"] += 1 elif tid < 57344: mod_counts["audio"] += 1 elif tid < 61440: mod_counts["video"] += 1 mod_md = "### 🌐 Modality Breakdown\n" emojis = {"special": "⚙️", "text": "📝", "image": "🖼️", "audio": "🔊", "video": "🎬"} for mod, count in mod_counts.items(): if count > 0: pct = count / n_tokens * 100 bar = "█" * max(1, int(pct / 2)) mod_md += f"{emojis.get(mod,'')} **{mod}**: {count} ({pct:.1f}%) `{bar}`\n\n" return vis_html, stats_md, compare_md, ids_md, mod_md def decode_ids(ids_text): try: ids = [int(x.strip()) for x in ids_text.split(",") if x.strip().lstrip('-').isdigit()] decoded = sentinel_tok.decode(ids, skip_special_tokens=False) tokens = sentinel_tok.convert_ids_to_tokens(ids) vis_html = '

' + make_token_html(tokens, ids) + '

' return decoded, vis_html except Exception as e: return f"Error: {e}", "" def run_multilingual_benchmark(): samples = { "🇬🇧 English": "Machine learning transforms data into intelligence through gradient optimization.", "🇫🇷 French": "L'apprentissage automatique transforme les données en intelligence grâce à l'optimisation.", "🇩🇪 German": "Maschinelles Lernen verwandelt Daten in Intelligenz durch mathematische Optimierung.", "🇪🇸 Spanish": "El aprendizaje automático transforma datos en inteligencia mediante optimización matemática.", "🇨🇳 Chinese": "机器学习通过数学优化将数据转化为智能。深度学习模型使用梯度下降来最小化损失函数。", "🇯🇵 Japanese": "機械学習はデータを知性に変換します。深層学習モデルは勾配降下法を使用します。", "🇸🇦 Arabic": "التعلم الآلي يحول البيانات إلى ذكاء من خلال التحسين الرياضي للخوارزميات.", "🇷🇺 Russian": "Машинное обучение преобразует данные в интеллект посредством математической оптимизации.", "🇰🇷 Korean": "머신러닝은 수학적 최적화를 통해 데이터를 지능으로 변환합니다.", "🇮🇳 Hindi": "मशीन लर्निंग गणितीय अनुकूलन के माध्यम से डेटा को बुद्धिमत्ता में बदलती है।", "🇵🇹 Portuguese": "O aprendizado de máquina transforma dados em inteligência por meio da otimização.", "🇻🇳 Vietnamese": "Học máy chuyển đổi dữ liệu thành trí tuệ thông qua tối ưu hóa toán học.", "🇹🇭 Thai": "การเรียนรู้ของเครื่องเปลี่ยนข้อมูลเป็นปัญญาผ่านการเพิ่มประสิทธิภาพทางคณิตศาสตร์", "🐍 Python": "def sech(x): return 1.0 / math.cosh(x * (1/math.e))", "📐 Math": "∫₀¹ x⁻ˣ dx = Σ n⁻ⁿ ≈ 1.291, ∇f = (∂f/∂x₁, ∂f/∂x₂)", } all_toks = {"**Sentinel**": sentinel_tok, **baselines} header = "| Language | Text |" sep = "|:---------|:-----|" for tname in all_toks: short = tname.replace("**","").split("(")[0].strip()[:10] header += f" {short} |" sep += " ---: |" rows = [header, sep] for lang, text in samples.items(): n_bytes = len(text.encode("utf-8")) row = f"| {lang} | {text[:35]}… |" for tname, tok in all_toks.items(): try: enc = tok.encode(text, add_special_tokens=False) comp = n_bytes / max(len(enc), 1) row += f" **{comp:.2f}** |" if "Sentinel" in tname else f" {comp:.2f} |" except: row += " — |" rows.append(row) return "\n".join(rows) EXAMPLES = [ "The Sentinel Manifold: F(z) = Σ zⁿ/nⁿ, where lim F'(z)/F(z) = 1/e ≈ 0.3679. This unified mathematical framework powers optimization, attention, quantization, and generation.", "机器学习通过数学优化将数据转化为智能。Sentinel多样体提供了统一的数学框架。", "def sentinel_attention(Q, K, V):\n scores = Q @ K.T / math.sqrt(d)\n attn = 1.0 / torch.cosh(scores)\n return (attn / attn.sum(-1, keepdim=True)) @ V", "Describe: Listen: ", "You are a multimodal AI.What is 1/e?1/e ≈ 0.3679, the Gradient Axiom limit.", "∫₀¹ x⁻ˣ dx = Σ n⁻ⁿ ≈ 1.29128, ∇·E = ρ/ε₀, det(A−λI) = 0", "الثعلب البني السريع يقفز فوق الكلب الكسول. التعلم الآلي يحول البيانات إلى ذكاء.", "🦴🧠🔬💡🚀 Sentinel uses sech(x) = 1/cosh(x) for bounded gradients 📈 across modalities 🖼️🔊🎬", ] with gr.Blocks( title="🦴 Sentinel Universal Tokenizer", theme=gr.themes.Soft(primary_hue="indigo", secondary_hue="purple"), ) as demo: gr.Markdown(""" # 🦴 Sentinel Universal Tokenizer **One theorem. Every modality. One vocabulary.** A **61,440-token** multimodal tokenizer for **text + image + audio + video**, grounded in the Gradient Axiom: `lim F'(z)/F(z) = 1/e` | Constant | Value | Role | |:---------|:------|:-----| | **1/e** | 0.36788 | Vocab allocation ratio | | **C₁** | −0.00799 | Quantization zero-point | | **C₂** | 0.00020 | Fairness bound | """) with gr.Tabs(): with gr.Tab("🔤 Tokenize"): with gr.Row(): with gr.Column(scale=2): text_input = gr.Textbox(label="Input Text", lines=5, value=EXAMPLES[0], placeholder="Enter text in any language, code, math, or multimodal format...") tokenize_btn = gr.Button("🦴 Tokenize", variant="primary", size="lg") gr.Examples(examples=EXAMPLES, inputs=text_input, label="Try these:") with gr.Column(scale=3): token_vis = gr.HTML(label="Token Visualization") with gr.Row(): stats_out = gr.Markdown() compare_out = gr.Markdown() with gr.Row(): ids_out = gr.Markdown() mod_out = gr.Markdown() tokenize_btn.click(tokenize_and_analyze, inputs=text_input, outputs=[token_vis, stats_out, compare_out, ids_out, mod_out]) with gr.Tab("🔙 Decode"): gr.Markdown("### Decode Token IDs → Text") ids_input = gr.Textbox(label="Token IDs (comma-separated)", placeholder="72, 4153, 33, 3004, 592", lines=2) decode_btn = gr.Button("Decode", variant="primary") decoded_text = gr.Textbox(label="Decoded Text", lines=3) decoded_vis = gr.HTML(label="Tokens") decode_btn.click(decode_ids, inputs=ids_input, outputs=[decoded_text, decoded_vis]) with gr.Tab("📊 Benchmark"): gr.Markdown("### Multilingual Compression Benchmark\nCompression ratio (bytes/token). **Higher = better.**") bench_btn = gr.Button("Run Benchmark", variant="primary") bench_out = gr.Markdown() bench_btn.click(run_multilingual_benchmark, outputs=bench_out) with gr.Tab("📖 Vocabulary"): gr.Markdown(f"""### Architecture ``` ┌──────────────────────────────────────────────────┐ │ SENTINEL UNIVERSAL TOKENIZER (61,440 tokens) │ │ │ │ [0–32] → 33 Special/Control tokens │ │ [33–32,767] → 32,735 ByteLevel BPE (text) │ │ [32,768–49,151] → 16,384 Image codebook (VQ) │ │ [49,152–57,343] → 8,192 Audio codebook (VQ) │ │ [57,344–61,439] → 4,096 Video codebook (VQ) │ │ │ │ Follows 1/e Gradient Axiom scaling │ └──────────────────────────────────────────────────┘ ``` **Total**: {len(sentinel_tok):,} tokens | **Text**: 32K | **Image**: 16K | **Audio**: 8K | **Video**: 4K """) specials_md = "### Special Tokens\n| Token | ID | Purpose |\n|:------|---:|:--------|\n" for tok_name, purpose in [ ("","Padding"), ("","Unknown"), ("~~","BOS"), ("~~","EOS"), ("","MLM"), ("","Image start"), ("","Image end"), ("","Image placeholder"), ("","Audio start"), ("","Audio end"), ("