Spaces:
Sleeping
Sleeping
"""
🦴 Sentinel Universal Tokenizer — Interactive Demo Space
Multimodal tokenizer grounded in the Gradient Axiom: lim F'(z)/F(z) = 1/e
"""
| import math | |
| import gradio as gr | |
| from transformers import AutoTokenizer | |
| import colorsys | |
# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------
# Within the visible code these three values are only interpolated into the
# "About" tab Markdown; no computation in this file reads them.

# Reciprocal of Euler's number (about 0.3679).
INV_E = 1.0 / math.e
# Labelled "Attracting fixed point" in the About tab table.
C1 = -0.007994021805952546
# Labelled "Escape threshold" in the About tab table.
C2 = 0.00020005604296784437
# ---------------------------------------------------------------------------
# Load tokenizers
# ---------------------------------------------------------------------------
# Everything below runs at import time. The Sentinel load is not guarded, so a
# failure there propagates and stops the module from importing.
print("Loading Sentinel Universal Tokenizer...")
sentinel_tok = AutoTokenizer.from_pretrained("5dimension/sentinel-universal-tokenizer")
print(f" โ Sentinel loaded: {len(sentinel_tok):,} tokens")
print("Loading baseline tokenizers...")
# Display name -> tokenizer, holding only the baselines that loaded.
baselines = {}
for name, model_id in [
    ("GPT-2 (50K)", "gpt2"),
    ("Gemma (256K)", "google/gemma-2b"),
    ("Qwen2 (152K)", "Qwen/Qwen2-0.5B"),
]:
    try:
        baselines[name] = AutoTokenizer.from_pretrained(model_id)
        print(f" โ {name} loaded")
    except Exception as e:
        # Best effort: a baseline that cannot be loaded is reported on stdout
        # and left out of `baselines`; the app carries on without it.
        print(f" โ {name}: {e}")
def get_modality_color(token_id):
    """Return the fixed highlight colour for ``token_id``, or ``None``.

    The ID space is split by exclusive upper bounds. IDs in ``[33, 32768)``
    (the range ``make_token_html`` labels "text") have no fixed colour and
    yield ``None``. IDs at or above 61440 get the grey fallback.

    Args:
        token_id: Integer token ID.

    Returns:
        A ``#rrggbb`` string, or ``None`` when no fixed colour applies.
    """
    # (exclusive upper bound, colour), scanned in ascending order.
    palette = (
        (33, "#ff6b6b"),
        (32768, None),
        (49152, "#4ecdc4"),
        (57344, "#45b7d1"),
        (61440, "#96ceb4"),
    )
    for upper_bound, colour in palette:
        if token_id < upper_bound:
            return colour
    return "#95a5a6"
def token_to_color(idx, total):
    """Derive a pastel ``#rrggbb`` colour from a token's position.

    The hue advances by the golden-ratio conjugate per position, so
    neighbouring tokens get clearly different hues. Saturation cycles through
    three levels and brightness is fixed.

    Args:
        idx: Zero-based position of the token.
        total: Accepted for the caller's convenience; not used here.

    Returns:
        A lowercase ``#rrggbb`` string.
    """
    golden_step = 0.618033988749895
    hue = (idx * golden_step) % 1.0
    saturation = 0.35 + 0.15 * (idx % 3)
    channels = colorsys.hsv_to_rgb(hue, saturation, 0.92)
    return "#" + "".join(f"{int(channel * 255):02x}" for channel in channels)
def make_token_html(tokens, token_ids):
    """Render tokens as a run of coloured, hover-annotated HTML ``<span>`` chips.

    Each chip shows the token text and carries a tooltip with the token ID and
    a modality label derived from the ID range (special / text / img / aud /
    vid, or "?" at and above 61440). Chips use the fixed modality colour when
    there is one, otherwise a position-based colour.

    Token text is HTML-escaped before it is placed in the markup. Previously
    the ``<`` / ``>`` replacements were no-ops, so a token such as
    ``<system>`` was emitted as a live tag instead of visible text, and ``&``
    was never escaped.

    Args:
        tokens: Token strings, parallel to ``token_ids``.
        token_ids: Integer token IDs.

    Returns:
        One HTML string with all chips concatenated. Empty when either input
        is empty, because the two sequences are zipped.
    """
    from html import escape  # local import keeps this block self-contained

    # Written as escapes so the markers survive re-encoding: U+00B7 (middle
    # dot) stands for a space, U+21B5 (return arrow) for a newline.
    # NOTE(review): the previous literals were mis-encoded; these are
    # presumably the intended glyphs. Confirm against the upstream file.
    space_marker = "\u00b7"
    newline_marker = "\u21b5"

    parts = []
    for i, (tok, tid) in enumerate(zip(tokens, token_ids)):
        mod_color = get_modality_color(tid)
        color = mod_color if mod_color else token_to_color(i, len(tokens))
        if tid < 33:
            mod = "special"
        elif tid < 32768:
            mod = "text"
        elif tid < 49152:
            mod = f"img[{tid-32768}]"
        elif tid < 57344:
            mod = f"aud[{tid-49152}]"
        elif tid < 61440:
            mod = f"vid[{tid-57344}]"
        else:
            mod = "?"
        # Escape first: the output of escape() contains no spaces or newlines
        # of its own, so the marker substitutions only touch original text.
        display = escape(tok).replace(" ", space_marker).replace("\n", newline_marker)
        if not display.strip():
            # Keep empty or whitespace-only tokens visible as a chip.
            display = space_marker
        parts.append(
            f'<span title="ID={tid} | {mod}" '
            f'style="background:{color}; padding:2px 5px; margin:1px; '
            f'border-radius:4px; display:inline-block; font-family:monospace; '
            f'font-size:13px; cursor:pointer; border:1px solid rgba(0,0,0,0.1);">'
            f'{display}</span>'
        )
    return "".join(parts)
def tokenize_and_analyze(text):
    """Tokenize ``text`` with the Sentinel tokenizer and build the UI panels.

    Args:
        text: Input string from the textbox. ``None``, empty and
            whitespace-only values short-circuit.

    Returns:
        A 5-tuple ``(vis_html, stats_md, compare_md, ids_md, mod_md)``:
        the token-chip HTML, a stats table, a comparison table against the
        loaded baselines, the first 150 token IDs, and a per-modality
        breakdown. For blank input the first element is a prompt message and
        the rest are empty strings.
    """
    if not text or not text.strip():
        return ("Enter some text to tokenize.", "", "", "", "")

    enc = sentinel_tok.encode(text, add_special_tokens=False)
    tokens = sentinel_tok.convert_ids_to_tokens(enc)
    n_tokens = len(enc)
    n_bytes = len(text.encode("utf-8"))
    # The max() guards keep both ratios defined for degenerate inputs.
    n_words = max(len(text.split()), 1)
    compression = n_bytes / max(n_tokens, 1)
    fertility = n_tokens / n_words

    vis_html = '<div style="line-height:2.2; padding:10px; background:#f8f9fa; border-radius:8px;">' + make_token_html(tokens, enc) + '</div>'

    # NOTE(review): several literals below look mis-encoded in this copy of
    # the file; they are reproduced unchanged.
    stats_md = f"""### ๐ Stats
| Metric | Value |
|:-------|------:|
| **Tokens** | **{n_tokens}** |
| Bytes | {n_bytes} |
| Words | {n_words} |
| **Compression** | **{compression:.3f}** bytes/token |
| Fertility | {fertility:.3f} tokens/word |
"""

    rows = [f"| **Sentinel-SUT** | **{len(sentinel_tok):,}** | **{n_tokens}** | **{compression:.3f}** | **{fertility:.3f}** |"]
    for bname, btok in baselines.items():
        try:
            benc = btok.encode(text, add_special_tokens=False)
            bn = len(benc)
            bcomp = n_bytes / max(bn, 1)
            bfert = bn / n_words
            rows.append(f"| {bname} | {len(btok):,} | {bn} | {bcomp:.3f} | {bfert:.3f} |")
        except Exception:
            # Best effort: a failing baseline gets a placeholder row. Narrowed
            # from a bare ``except:`` so KeyboardInterrupt and SystemExit are
            # no longer swallowed.
            rows.append(f"| {bname} | โ | โ | โ | โ |")
    compare_md = "### โ๏ธ vs SOTA\n| Tokenizer | Vocab | Tokens | Compressโ | Fertilityโ |\n|:----------|------:|-------:|----------:|-----------:|\n" + "\n".join(rows)

    # Show at most 150 IDs and summarise the remainder.
    ids_text = ", ".join(str(x) for x in enc[:150])
    if len(enc) > 150:
        ids_text += f" โฆ +{len(enc)-150} more"
    ids_md = f"**Token IDs** ({n_tokens}):\n```\n{ids_text}\n```"

    # IDs at or above 61440 fall into no bucket and are not counted.
    mod_counts = {"special": 0, "text": 0, "image": 0, "audio": 0, "video": 0}
    for tid in enc:
        if tid < 33:
            mod_counts["special"] += 1
        elif tid < 32768:
            mod_counts["text"] += 1
        elif tid < 49152:
            mod_counts["image"] += 1
        elif tid < 57344:
            mod_counts["audio"] += 1
        elif tid < 61440:
            mod_counts["video"] += 1

    mod_md = "### ๐ Modality Breakdown\n"
    emojis = {"special": "โ๏ธ", "text": "๐", "image": "๐ผ๏ธ", "audio": "๐", "video": "๐ฌ"}
    for mod, count in mod_counts.items():
        if count > 0:
            pct = count / n_tokens * 100
            # One bar glyph per 2%, with at least one so small shares show up.
            bar = "โ" * max(1, int(pct / 2))
            mod_md += f"{emojis.get(mod,'')} **{mod}**: {count} ({pct:.1f}%) `{bar}`\n\n"

    return vis_html, stats_md, compare_md, ids_md, mod_md
def decode_ids(ids_text):
    """Decode a comma-separated list of token IDs back to text.

    Entries that are not an optionally minus-prefixed run of digits are
    dropped without comment. Any exception raised while parsing, decoding or
    rendering is reported in the first return value instead of propagating.

    Args:
        ids_text: Comma-separated IDs, e.g. ``"72, 4153, 33"``.

    Returns:
        ``(decoded_text, tokens_html)`` on success, or
        ``("Error: <message>", "")`` on failure.
    """
    try:
        ids = []
        for piece in ids_text.split(","):
            candidate = piece.strip()
            if candidate.lstrip('-').isdigit():
                ids.append(int(candidate))
        decoded = sentinel_tok.decode(ids, skip_special_tokens=False)
        tokens = sentinel_tok.convert_ids_to_tokens(ids)
        chips = make_token_html(tokens, ids)
        vis_html = (
            '<div style="line-height:2.2; padding:10px; background:#f8f9fa; border-radius:8px;">'
            + chips
            + '</div>'
        )
    except Exception as e:
        return f"Error: {e}", ""
    return decoded, vis_html
def run_multilingual_benchmark():
    """Build a Markdown table of bytes-per-token ratios for fixed samples.

    Each sample is encoded with the Sentinel tokenizer and every loaded
    baseline. A cell holds ``utf8_bytes / token_count`` to two decimals, and
    the Sentinel column is rendered in bold.

    Returns:
        The Markdown table as one newline-joined string.
    """
    # NOTE(review): the non-ASCII sample texts look mis-encoded in this copy
    # of the file; they are reproduced unchanged.
    samples = {
        "๐ฌ๐ง English": "Machine learning transforms data into intelligence through gradient optimization.",
        "๐ซ๐ท French": "L'apprentissage automatique transforme les donnรฉes en intelligence grรขce ร l'optimisation.",
        "๐ฉ๐ช German": "Maschinelles Lernen verwandelt Daten in Intelligenz durch mathematische Optimierung.",
        "๐ช๐ธ Spanish": "El aprendizaje automรกtico transforma datos en inteligencia mediante optimizaciรณn matemรกtica.",
        "๐จ๐ณ Chinese": "ๆบๅจๅญฆไน ้่ฟๆฐๅญฆไผๅๅฐๆฐๆฎ่ฝฌๅไธบๆบ่ฝใๆทฑๅบฆๅญฆไน ๆจกๅไฝฟ็จๆขฏๅบฆไธ้ๆฅๆๅฐๅๆๅคฑๅฝๆฐใ",
        "๐ฏ๐ต Japanese": "ๆฉๆขฐๅญฆ็ฟใฏใใผใฟใ็ฅๆงใซๅคๆใใพใใๆทฑๅฑคๅญฆ็ฟใขใใซใฏๅพ้ ้ไธๆณใไฝฟ็จใใพใใ",
        "๐ธ๐ฆ Arabic": "ุงูุชุนูู ุงูุขูู ูุญูู ุงูุจูุงูุงุช ุฅูู ุฐูุงุก ู ู ุฎูุงู ุงูุชุญุณูู ุงูุฑูุงุถู ููุฎูุงุฑุฒู ูุงุช.",
        "๐ท๐บ Russian": "ะะฐัะธะฝะฝะพะต ะพะฑััะตะฝะธะต ะฟัะตะพะฑัะฐะทัะตั ะดะฐะฝะฝัะต ะฒ ะธะฝัะตะปะปะตะบั ะฟะพััะตะดััะฒะพะผ ะผะฐัะตะผะฐัะธัะตัะบะพะน ะพะฟัะธะผะธะทะฐัะธะธ.",
        "๐ฐ๐ท Korean": "๋จธ์ ๋ฌ๋์ ์ํ์ ์ต์ ํ๋ฅผ ํตํด ๋ฐ์ดํฐ๋ฅผ ์ง๋ฅ์ผ๋ก ๋ณํํฉ๋๋ค.",
        "๐ฎ๐ณ Hindi": "เคฎเคถเฅเคจ เคฒเคฐเฅเคจเคฟเคเค เคเคฃเคฟเคคเฅเคฏ เค เคจเฅเคเฅเคฒเคจ เคเฅ เคฎเคพเคงเฅเคฏเคฎ เคธเฅ เคกเฅเคเคพ เคเฅ เคฌเฅเคฆเฅเคงเคฟเคฎเคคเฅเคคเคพ เคฎเฅเค เคฌเคฆเคฒเคคเฅ เคนเฅเฅค",
        "๐ต๐น Portuguese": "O aprendizado de mรกquina transforma dados em inteligรชncia por meio da otimizaรงรฃo.",
        "๐ป๐ณ Vietnamese": "Hแปc mรกy chuyแปn ฤแปi dแปฏ liแปu thร nh trรญ tuแป thรดng qua tแปi ฦฐu hรณa toรกn hแปc.",
        "๐น๐ญ Thai": "เธเธฒเธฃเนเธฃเธตเธขเธเธฃเธนเนเธเธญเธเนเธเธฃเธทเนเธญเธเนเธเธฅเธตเนเธขเธเธเนเธญเธกเธนเธฅเนเธเนเธเธเธฑเธเธเธฒเธเนเธฒเธเธเธฒเธฃเนเธเธดเนเธกเธเธฃเธฐเธชเธดเธเธเธดเธ เธฒเธเธเธฒเธเธเธเธดเธเธจเธฒเธชเธเธฃเน",
        "๐ Python": "def sech(x): return 1.0 / math.cosh(x * (1/math.e))",
        "๐ Math": "โซโยน xโปหฃ dx = ฮฃ nโปโฟ โ 1.291, โf = (โf/โxโ, โf/โxโ)",
    }

    # Sentinel goes first so it takes the first data column.
    all_toks = {"**Sentinel**": sentinel_tok, **baselines}

    header = "| Language | Text |"
    sep = "|:---------|:-----|"
    for tname in all_toks:
        # Column title: drop bold markers and the "(size)" suffix, cap at 10 chars.
        short = tname.replace("**","").split("(")[0].strip()[:10]
        header += f" {short} |"
        sep += " ---: |"

    rows = [header, sep]
    for lang, text in samples.items():
        n_bytes = len(text.encode("utf-8"))
        row = f"| {lang} | {text[:35]}โฆ |"
        for tname, tok in all_toks.items():
            try:
                enc = tok.encode(text, add_special_tokens=False)
                comp = n_bytes / max(len(enc), 1)
                row += f" **{comp:.2f}** |" if "Sentinel" in tname else f" {comp:.2f} |"
            except Exception:
                # Best effort: a failing tokenizer gets a placeholder cell.
                # Narrowed from a bare ``except:`` so KeyboardInterrupt and
                # SystemExit are no longer swallowed.
                row += " โ |"
        rows.append(row)
    return "\n".join(rows)
# Sample inputs for the Tokenize tab. The first entry is the textbox's initial
# value and the whole list is passed to gr.Examples.
# NOTE(review): the non-ASCII text in these literals looks mis-encoded in this
# copy of the file; confirm against the upstream source before editing them.
EXAMPLES = [
    "The Sentinel Manifold: F(z) = ฮฃ zโฟ/nโฟ, where lim F'(z)/F(z) = 1/e โ 0.3679. This unified mathematical framework powers optimization, attention, quantization, and generation.",
    "ๆบๅจๅญฆไน ้่ฟๆฐๅญฆไผๅๅฐๆฐๆฎ่ฝฌๅไธบๆบ่ฝใSentinelๅคๆ ทไฝๆไพไบ็ปไธ็ๆฐๅญฆๆกๆถใ",
    "def sentinel_attention(Q, K, V):\n scores = Q @ K.T / math.sqrt(d)\n attn = 1.0 / torch.cosh(scores)\n return (attn / attn.sum(-1, keepdim=True)) @ V",
    "Describe: <image_start> <img_42> <img_1337> <img_256> <image_end> Listen: <audio_start> <aud_100> <aud_200> <audio_end>",
    "<system>You are a multimodal AI.</system><user>What is 1/e?</user><assistant>1/e โ 0.3679, the Gradient Axiom limit.</assistant>",
    "โซโยน xโปหฃ dx = ฮฃ nโปโฟ โ 1.29128, โยทE = ฯ/ฮตโ, det(AโฮปI) = 0",
    "ุงูุซุนูุจ ุงูุจูู ุงูุณุฑูุน ูููุฒ ููู ุงูููุจ ุงููุณูู. ุงูุชุนูู ุงูุขูู ูุญูู ุงูุจูุงูุงุช ุฅูู ุฐูุงุก.",
    "๐ฆด๐ง ๐ฌ๐ก๐ Sentinel uses sech(x) = 1/cosh(x) for bounded gradients ๐ across modalities ๐ผ๏ธ๐๐ฌ",
]
# Gradio UI: a header plus five tabs (Tokenize, Decode, Benchmark, Vocabulary,
# About). The Tokenize handler is also run once on page load.
# NOTE(review): this copy of the file carries no indentation, so the nesting
# of the Row/Column containers below is reconstructed; confirm the layout
# against the running app.
# NOTE(review): non-ASCII text in the literals below looks mis-encoded in this
# copy; it is reproduced unchanged apart from the escaped pipes noted in the
# About tab.
with gr.Blocks(
    title="๐ฆด Sentinel Universal Tokenizer",
    theme=gr.themes.Soft(primary_hue="indigo", secondary_hue="purple"),
) as demo:
    gr.Markdown("""
# ๐ฆด Sentinel Universal Tokenizer
**One theorem. Every modality. One vocabulary.**
A **61,440-token** multimodal tokenizer for **text + image + audio + video**,
grounded in the Gradient Axiom: `lim F'(z)/F(z) = 1/e`
| Constant | Value | Role |
|:---------|:------|:-----|
| **1/e** | 0.36788 | Vocab allocation ratio |
| **Cโ** | โ0.00799 | Quantization zero-point |
| **Cโ** | 0.00020 | Fairness bound |
""")
    with gr.Tabs():
        with gr.Tab("๐ค Tokenize"):
            with gr.Row():
                with gr.Column(scale=2):
                    text_input = gr.Textbox(label="Input Text", lines=5, value=EXAMPLES[0], placeholder="Enter text in any language, code, math, or multimodal format...")
                    tokenize_btn = gr.Button("๐ฆด Tokenize", variant="primary", size="lg")
                    gr.Examples(examples=EXAMPLES, inputs=text_input, label="Try these:")
                with gr.Column(scale=3):
                    token_vis = gr.HTML(label="Token Visualization")
                    with gr.Row():
                        stats_out = gr.Markdown()
                        compare_out = gr.Markdown()
                    with gr.Row():
                        ids_out = gr.Markdown()
                        mod_out = gr.Markdown()
            tokenize_btn.click(tokenize_and_analyze, inputs=text_input, outputs=[token_vis, stats_out, compare_out, ids_out, mod_out])
        with gr.Tab("๐ Decode"):
            gr.Markdown("### Decode Token IDs โ Text")
            ids_input = gr.Textbox(label="Token IDs (comma-separated)", placeholder="72, 4153, 33, 3004, 592", lines=2)
            decode_btn = gr.Button("Decode", variant="primary")
            decoded_text = gr.Textbox(label="Decoded Text", lines=3)
            decoded_vis = gr.HTML(label="Tokens")
            decode_btn.click(decode_ids, inputs=ids_input, outputs=[decoded_text, decoded_vis])
        with gr.Tab("๐ Benchmark"):
            gr.Markdown("### Multilingual Compression Benchmark\nCompression ratio (bytes/token). **Higher = better.**")
            bench_btn = gr.Button("Run Benchmark", variant="primary")
            bench_out = gr.Markdown()
            bench_btn.click(run_multilingual_benchmark, outputs=bench_out)
        with gr.Tab("๐ Vocabulary"):
            gr.Markdown(f"""### Architecture
```
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
โ SENTINEL UNIVERSAL TOKENIZER (61,440 tokens) โ
โ โ
โ [0โ32] โ 33 Special/Control tokens โ
โ [33โ32,767] โ 32,735 ByteLevel BPE (text) โ
โ [32,768โ49,151] โ 16,384 Image codebook (VQ) โ
โ [49,152โ57,343] โ 8,192 Audio codebook (VQ) โ
โ [57,344โ61,439] โ 4,096 Video codebook (VQ) โ
โ โ
โ Follows 1/e Gradient Axiom scaling โ
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
```
**Total**: {len(sentinel_tok):,} tokens | **Text**: 32K | **Image**: 16K | **Audio**: 8K | **Video**: 4K
""")
            # The special-token table is built at start-up, so the IDs shown
            # are whatever the loaded tokenizer reports for each name.
            specials_md = "### Special Tokens\n| Token | ID | Purpose |\n|:------|---:|:--------|\n"
            for tok_name, purpose in [
                ("<pad>","Padding"), ("<unk>","Unknown"), ("<s>","BOS"), ("</s>","EOS"), ("<mask>","MLM"),
                ("<image_start>","Image start"), ("<image_end>","Image end"), ("<image>","Image placeholder"),
                ("<audio_start>","Audio start"), ("<audio_end>","Audio end"), ("<audio>","Audio placeholder"),
                ("<video_start>","Video start"), ("<video_end>","Video end"), ("<video>","Video placeholder"),
                ("<sentinel>","Manifold marker"), ("<sentinel_c1>","Cโ"), ("<sentinel_c2>","Cโ"), ("<scale_1e>","1/e"),
                ("<system>","System msg"), ("<user>","User msg"), ("<assistant>","Assistant msg"),
                ("<code_start>","Code start"), ("<code_end>","Code end"),
                ("<math_start>","Math start"), ("<math_end>","Math end"),
            ]:
                tid = sentinel_tok.convert_tokens_to_ids(tok_name)
                specials_md += f"| `{tok_name}` | {tid} | {purpose} |\n"
            # Plain string: this literal has no placeholders, so the f prefix
            # was dropped.
            specials_md += "\n### Codebook Ranges\n| Modality | Start | End | Size |\n|:---------|------:|----:|-----:|\n"
            specials_md += f"| ๐ผ๏ธ Image | {sentinel_tok.convert_tokens_to_ids('<img_0>')} | {sentinel_tok.convert_tokens_to_ids('<img_16383>')} | 16,384 |\n"
            specials_md += f"| ๐ Audio | {sentinel_tok.convert_tokens_to_ids('<aud_0>')} | {sentinel_tok.convert_tokens_to_ids('<aud_8191>')} | 8,192 |\n"
            specials_md += f"| ๐ฌ Video | {sentinel_tok.convert_tokens_to_ids('<vid_0>')} | {sentinel_tok.convert_tokens_to_ids('<vid_4095>')} | 4,096 |\n"
            gr.Markdown(specials_md)
        with gr.Tab("๐งฌ About"):
            # The absolute-value bars in the "sech Scoring" row are escaped as
            # \| so they stay inside one table cell; unescaped, they split the
            # row into more cells than the header has.
            gr.Markdown(f"""### The Sentinel Manifold
**Function**: `F(z) = ฮฃ z^n / n^n` (Sophomore's Dream, Bernoulli 1697)
**Gradient Axiom**: `lim F'(z)/F(z) = 1/e โ {INV_E:.15f}`
| Principle | Math | Tokenizer Application |
|:----------|:-----|:----------------------|
| 1/e Allocation | Gradient Axiom | Modality budget = prev ร 1/e |
| sech Scoring | Bounded \\|โsech/โx\\| โค 0.65 | Dampened BPE merges |
| Cโ = {C1:.6f} | Attracting fixed point | Embedding quantization center |
| Cโ = {C2:.6f} | Escape threshold | Fertility fairness bound |
### Efficiency Champion ๐
| Tokenizer | Vocab | Efficiency/1K vocab |
|:----------|------:|--------------------:|
| **Sentinel** | 61K | **0.0563** ๐ฅ |
| GPT-2 | 50K | 0.0511 |
| Qwen2 | 152K | 0.0256 |
| Gemma | 256K | 0.0177 |
*3.2ร more efficient per vocab token than Gemma, 2.2ร more than Qwen2*
---
๐ฆ [Model](https://huggingface.co/5dimension/sentinel-universal-tokenizer) ยท ๐ฆด [Framework](https://huggingface.co/5dimension/sentinel-manifold-discoveries) ยท MIT License
*Built by Romain Abdel-Aal (ASI The Sentinel V5.2 Bone-Core)*
""")
    # Fill the Tokenize tab once on page load using the textbox's initial value.
    demo.load(tokenize_and_analyze, inputs=text_input, outputs=[token_vis, stats_out, compare_out, ids_out, mod_out])
# Launch the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    demo.launch()