Spaces:

5dimension
/

sentinel-tokenizer-space

Sleeping

App Files Files Community

sentinel-tokenizer-space / app.py

5dimension

Add Gradio app for Sentinel Universal Tokenizer demo

8c14443 verified 27 days ago

raw

history blame contribute delete

17.2 kB

	"""
	🦴 Sentinel Universal Tokenizer — Interactive Demo Space
	Multimodal tokenizer grounded in the Gradient Axiom: lim F'(z)/F(z) = 1/e
	"""

	import math
	import gradio as gr
	from transformers import AutoTokenizer
	import colorsys

	# ──────────────────────────────────────────────────────────────────────────────
	# Constants
	# ──────────────────────────────────────────────────────────────────────────────
	# Reciprocal of Euler's number. In the visible code it is only interpolated
	# into the "About" tab text as the value of the Gradient Axiom limit.
	INV_E = 1.0 / math.e
	# C1 and C2 are display-only in the visible code (formatted into the "About"
	# tab table). The UI copy labels C1 an "attracting fixed point" and C2 an
	# "escape threshold"; NOTE(review): their derivation is not shown in this file.
	C1 = -0.007994021805952546
	C2 = 0.00020005604296784437

	# ──────────────────────────────────────────────────────────────────────────────
	# Load tokenizers
	# ──────────────────────────────────────────────────────────────────────────────
	# The Sentinel tokenizer is mandatory: a failure here propagates and aborts
	# start-up.
	print("Loading Sentinel Universal Tokenizer...")
	sentinel_tok = AutoTokenizer.from_pretrained("5dimension/sentinel-universal-tokenizer")
	print(f" ✓ Sentinel loaded: {len(sentinel_tok):,} tokens")

	# Baselines are best-effort: one that fails to load is reported and left out
	# of `baselines`, and the app continues with whatever did load.
	print("Loading baseline tokenizers...")
	baselines = {}
	for label, repo_id in (
	    ("GPT-2 (50K)", "gpt2"),
	    ("Gemma (256K)", "google/gemma-2b"),
	    ("Qwen2 (152K)", "Qwen/Qwen2-0.5B"),
	):
	    try:
	        baselines[label] = AutoTokenizer.from_pretrained(repo_id)
	        print(f" ✓ {label} loaded")
	    except Exception as err:
	        print(f" ⚠ {label}: {err}")


	def get_modality_color(token_id):
	    """Return the fixed highlight colour for the ID range ``token_id`` falls in.

	    Args:
	        token_id: Integer vocabulary ID.

	    Returns:
	        A ``#rrggbb`` string for IDs below 33 and for IDs from 32768 upward,
	        or ``None`` for IDs in ``[33, 32768)`` so the caller can choose its
	        own per-token colour.
	    """
	    # (exclusive upper bound, colour), checked in ascending order.
	    bounded_colors = (
	        (33, "#ff6b6b"),
	        (32768, None),
	        (49152, "#4ecdc4"),
	        (57344, "#45b7d1"),
	        (61440, "#96ceb4"),
	    )
	    for upper_bound, color in bounded_colors:
	        if token_id < upper_bound:
	            return color
	    # At or above the last bound.
	    return "#95a5a6"


	def token_to_color(idx, total):
	    """Return a pastel ``#rrggbb`` colour for the token at position ``idx``.

	    The hue advances by the golden-ratio conjugate per position, so
	    consecutive positions land on different hues; saturation cycles through
	    three levels and brightness is fixed.

	    Args:
	        idx: Zero-based position of the token in the sequence.
	        total: Number of tokens in the sequence. Not used in the calculation;
	            kept so existing callers keep working.

	    Returns:
	        A seven-character lowercase hex colour string.
	    """
	    hue = (idx * 0.618033988749895) % 1.0
	    sat = 0.35 + 0.15 * (idx % 3)
	    val = 0.92
	    r, g, b = colorsys.hsv_to_rgb(hue, sat, val)
	    # Scale each 0..1 channel to 0..255. The multiplication was missing for
	    # r and g, which made them undefined names (`r255`, `g255`).
	    return f"#{int(r * 255):02x}{int(g * 255):02x}{int(b * 255):02x}"


	def make_token_html(tokens, token_ids):
	    """Render tokens as a run of coloured HTML ``<span>`` chips with tooltips.

	    Args:
	        tokens: Token strings, parallel to ``token_ids``.
	        token_ids: Integer vocabulary IDs; the ID selects both the chip colour
	            and the modality label shown in the tooltip.

	    Returns:
	        One HTML string with a span per token, or an empty string when there
	        are no tokens.
	    """
	    import html  # local import keeps this block self-contained

	    total = len(tokens)
	    parts = []
	    for i, (tok, tid) in enumerate(zip(tokens, token_ids)):
	        # Range colour when there is one, otherwise a per-position colour.
	        color = get_modality_color(tid) or token_to_color(i, total)

	        if tid < 33:
	            mod = "special"
	        elif tid < 32768:
	            mod = "text"
	        elif tid < 49152:
	            mod = f"img[{tid - 32768}]"
	        elif tid < 57344:
	            mod = f"aud[{tid - 49152}]"
	        elif tid < 61440:
	            mod = f"vid[{tid - 57344}]"
	        else:
	            mod = "?"

	        # Escape markup characters first: tokens such as "<image_start>" would
	        # otherwise be parsed by the browser as tags instead of being shown.
	        # Then make whitespace visible.
	        display = html.escape(tok).replace(" ", "·").replace("\n", "↵")
	        if not display.strip():
	            display = "·"

	        parts.append(
	            f'<span title="ID={tid} | {mod}" '
	            f'style="background:{color}; padding:2px 5px; margin:1px; '
	            f'border-radius:4px; display:inline-block; font-family:monospace; '
	            f'font-size:13px; cursor:pointer; border:1px solid rgba(0,0,0,0.1);">'
	            f'{display}</span>'
	        )
	    return "".join(parts)


	def tokenize_and_analyze(text):
	    """Tokenize ``text`` with the Sentinel tokenizer and build the five UI panels.

	    Args:
	        text: Raw user input from the textbox.

	    Returns:
	        A 5-tuple ``(vis_html, stats_md, compare_md, ids_md, mod_md)``: the
	        token chips as HTML, then four Markdown strings (stats table, baseline
	        comparison table, token-ID listing, modality breakdown). For empty or
	        whitespace-only input the first element is a prompt message and the
	        other four are empty strings.
	    """
	    if not text or not text.strip():
	        return ("Enter some text to tokenize.", "", "", "", "")

	    enc = sentinel_tok.encode(text, add_special_tokens=False)
	    tokens = sentinel_tok.convert_ids_to_tokens(enc)
	    n_tokens = len(enc)
	    n_bytes = len(text.encode("utf-8"))
	    # Both denominators are clamped to 1 so the ratios never divide by zero.
	    n_words = max(len(text.split()), 1)
	    compression = n_bytes / max(n_tokens, 1)
	    fertility = n_tokens / n_words

	    vis_html = (
	        '<div style="line-height:2.2; padding:10px; background:#f8f9fa; border-radius:8px;">'
	        + make_token_html(tokens, enc)
	        + '</div>'
	    )

	    # Markdown tables need bare "|" separators; an escaped "\|" is rendered as
	    # a literal pipe and the rows collapse into plain text.
	    stats_md = "\n".join([
	        "### 📈 Stats",
	        "| Metric | Value |",
	        "|:-------|------:|",
	        f"| Tokens | {n_tokens} |",
	        f"| Bytes | {n_bytes} |",
	        f"| Words | {n_words} |",
	        f"| Compression | {compression:.3f} bytes/token |",
	        f"| Fertility | {fertility:.3f} tokens/word |",
	        "",
	    ])

	    rows = [f"| Sentinel-SUT | {len(sentinel_tok):,} | {n_tokens} | {compression:.3f} | {fertility:.3f} |"]
	    for bname, btok in baselines.items():
	        try:
	            benc = btok.encode(text, add_special_tokens=False)
	            bn = len(benc)
	            bcomp = n_bytes / max(bn, 1)
	            bfert = bn / n_words
	            rows.append(f"| {bname} | {len(btok):,} | {bn} | {bcomp:.3f} | {bfert:.3f} |")
	        except Exception:
	            # Best effort: a baseline that fails on this input gets a dash row.
	            # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit
	            # through.
	            rows.append(f"| {bname} | — | — | — | — |")
	    compare_md = (
	        "### ⚔️ vs SOTA\n"
	        "| Tokenizer | Vocab | Tokens | Compress↑ | Fertility↓ |\n"
	        "|:----------|------:|-------:|----------:|-----------:|\n"
	        + "\n".join(rows)
	    )

	    # Show at most the first 150 IDs and summarise the remainder.
	    ids_text = ", ".join(str(x) for x in enc[:150])
	    if len(enc) > 150:
	        ids_text += f" … +{len(enc) - 150} more"
	    ids_md = f"Token IDs ({n_tokens}):\n```\n{ids_text}\n```"

	    mod_counts = {"special": 0, "text": 0, "image": 0, "audio": 0, "video": 0}
	    for tid in enc:
	        if tid < 33:
	            mod_counts["special"] += 1
	        elif tid < 32768:
	            mod_counts["text"] += 1
	        elif tid < 49152:
	            mod_counts["image"] += 1
	        elif tid < 57344:
	            mod_counts["audio"] += 1
	        elif tid < 61440:
	            mod_counts["video"] += 1

	    emojis = {"special": "⚙️", "text": "📝", "image": "🖼️", "audio": "🔊", "video": "🎬"}
	    mod_lines = ["### 🌐 Modality Breakdown\n"]
	    for mod, count in mod_counts.items():
	        if count > 0:
	            # count > 0 implies n_tokens > 0, so this division is safe.
	            pct = count / n_tokens * 100
	            bar = "█" * max(1, int(pct / 2))
	            mod_lines.append(f"{emojis.get(mod, '')} {mod}: {count} ({pct:.1f}%) `{bar}`\n\n")
	    mod_md = "".join(mod_lines)

	    return vis_html, stats_md, compare_md, ids_md, mod_md


	def decode_ids(ids_text):
	    """Decode a comma-separated list of token IDs back to text.

	    Args:
	        ids_text: User-supplied string such as ``"72, 4153, 33"``. Entries that
	            are not integers are skipped; ``None`` is treated as empty input.

	    Returns:
	        A 2-tuple ``(decoded_text, tokens_html)``. If decoding or rendering
	        fails, the first element is ``"Error: ..."`` and the second is an
	        empty string.
	    """
	    ids = []
	    for piece in (ids_text or "").split(","):
	        piece = piece.strip()
	        if not piece:
	            continue
	        try:
	            ids.append(int(piece))
	        except ValueError:
	            # Let int() decide what counts as a number. The earlier isdigit()
	            # pre-filter admitted strings int() rejects (e.g. "²", "--5"), so
	            # one stray entry failed the whole request instead of being skipped.
	            continue

	    try:
	        decoded = sentinel_tok.decode(ids, skip_special_tokens=False)
	        tokens = sentinel_tok.convert_ids_to_tokens(ids)
	        vis_html = (
	            '<div style="line-height:2.2; padding:10px; background:#f8f9fa; border-radius:8px;">'
	            + make_token_html(tokens, ids)
	            + '</div>'
	        )
	    except Exception as e:
	        # UI boundary: report the failure in the output box instead of raising.
	        return f"Error: {e}", ""
	    return decoded, vis_html


	def run_multilingual_benchmark():
	    """Build a Markdown table of bytes-per-token for fixed multilingual samples.

	    Each sample is encoded with the Sentinel tokenizer and every baseline that
	    loaded; the cell value is UTF-8 byte length divided by token count.

	    Returns:
	        A Markdown table string with one row per sample and one column per
	        tokenizer. A cell shows "—" when that tokenizer failed on that sample.
	    """
	    samples = {
	        "🇬🇧 English": "Machine learning transforms data into intelligence through gradient optimization.",
	        "🇫🇷 French": "L'apprentissage automatique transforme les données en intelligence grâce à l'optimisation.",
	        "🇩🇪 German": "Maschinelles Lernen verwandelt Daten in Intelligenz durch mathematische Optimierung.",
	        "🇪🇸 Spanish": "El aprendizaje automático transforma datos en inteligencia mediante optimización matemática.",
	        "🇨🇳 Chinese": "机器学习通过数学优化将数据转化为智能。深度学习模型使用梯度下降来最小化损失函数。",
	        "🇯🇵 Japanese": "機械学習はデータを知性に変換します。深層学習モデルは勾配降下法を使用します。",
	        "🇸🇦 Arabic": "التعلم الآلي يحول البيانات إلى ذكاء من خلال التحسين الرياضي للخوارزميات.",
	        "🇷🇺 Russian": "Машинное обучение преобразует данные в интеллект посредством математической оптимизации.",
	        "🇰🇷 Korean": "머신러닝은 수학적 최적화를 통해 데이터를 지능으로 변환합니다.",
	        "🇮🇳 Hindi": "मशीन लर्निंग गणितीय अनुकूलन के माध्यम से डेटा को बुद्धिमत्ता में बदलती है।",
	        "🇵🇹 Portuguese": "O aprendizado de máquina transforma dados em inteligência por meio da otimização.",
	        "🇻🇳 Vietnamese": "Học máy chuyển đổi dữ liệu thành trí tuệ thông qua tối ưu hóa toán học.",
	        "🇹🇭 Thai": "การเรียนรู้ของเครื่องเปลี่ยนข้อมูลเป็นปัญญาผ่านการเพิ่มประสิทธิภาพทางคณิตศาสตร์",
	        "🐍 Python": "def sech(x): return 1.0 / math.cosh(x * (1/math.e))",
	        "📐 Math": "∫₀¹ x⁻ˣ dx = Σ n⁻ⁿ ≈ 1.291, ∇f = (∂f/∂x₁, ∂f/∂x₂)",
	    }
	    all_toks = {"Sentinel": sentinel_tok, **baselines}

	    # Column titles: drop any bold markers and the "(size)" suffix, cap at 10 chars.
	    short_names = [
	        tname.replace("**", "").split("(")[0].strip()[:10] for tname in all_toks
	    ]
	    # Bare "|" separators: an escaped "\|" would be rendered as a literal pipe
	    # and no table would be produced.
	    header = "| Language | Text |" + "".join(f" {short} |" for short in short_names)
	    sep = "|:---------|:-----|" + " ---: |" * len(all_toks)

	    rows = [header, sep]
	    for lang, text in samples.items():
	        n_bytes = len(text.encode("utf-8"))
	        cells = []
	        for tname, tok in all_toks.items():
	            try:
	                enc = tok.encode(text, add_special_tokens=False)
	                comp = n_bytes / max(len(enc), 1)
	            except Exception:
	                # Best effort: one failing tokenizer must not abort the table.
	                cells.append(" — |")
	                continue
	            # Bold the Sentinel figure. Both branches of this conditional used
	            # to be identical, which made the check dead code.
	            cells.append(f" **{comp:.2f}** |" if "Sentinel" in tname else f" {comp:.2f} |")
	        rows.append(f"| {lang} | {text[:35]}… |" + "".join(cells))
	    return "\n".join(rows)


	# Sample inputs for the Tokenize tab: offered through gr.Examples, and
	# EXAMPLES[0] is the textbox's initial value.
	EXAMPLES = [
	"The Sentinel Manifold: F(z) = Σ zⁿ/nⁿ, where lim F'(z)/F(z) = 1/e ≈ 0.3679. This unified mathematical framework powers optimization, attention, quantization, and generation.",
	"机器学习通过数学优化将数据转化为智能。Sentinel多样体提供了统一的数学框架。",
	"def sentinel_attention(Q, K, V):\n scores = Q @ K.T / math.sqrt(d)\n attn = 1.0 / torch.cosh(scores)\n return (attn / attn.sum(-1, keepdim=True)) @ V",
	"Describe: <image_start> <img_42> <img_1337> <img_256> <image_end> Listen: <audio_start> <aud_100> <aud_200> <audio_end>",
	"<system>You are a multimodal AI.</system><user>What is 1/e?</user><assistant>1/e ≈ 0.3679, the Gradient Axiom limit.</assistant>",
	"∫₀¹ x⁻ˣ dx = Σ n⁻ⁿ ≈ 1.29128, ∇·E = ρ/ε₀, det(A−λI) = 0",
	"الثعلب البني السريع يقفز فوق الكلب الكسول. التعلم الآلي يحول البيانات إلى ذكاء.",
	"🦴🧠🔬💡🚀 Sentinel uses sech(x) = 1/cosh(x) for bounded gradients 📈 across modalities 🖼️🔊🎬",
	]

	# ──────────────────────────────────────────────────────────────────────────────
	# UI: five tabs (Tokenize, Decode, Benchmark, Vocabulary, About).
	# Markdown tables below use bare "|" separators; an escaped "\|" is rendered as
	# a literal pipe and the table collapses into plain text.
	# Multi-line Markdown literals are kept flush with the block margin so no
	# leading whitespace ends up in the rendered text.
	# ──────────────────────────────────────────────────────────────────────────────
	with gr.Blocks(
	    title="🦴 Sentinel Universal Tokenizer",
	    theme=gr.themes.Soft(primary_hue="indigo", secondary_hue="purple"),
	) as demo:

	    gr.Markdown("""
	# 🦴 Sentinel Universal Tokenizer

	One theorem. Every modality. One vocabulary.

	A 61,440-token multimodal tokenizer for text + image + audio + video,
	grounded in the Gradient Axiom: `lim F'(z)/F(z) = 1/e`

	| Constant | Value | Role |
	|:---------|:------|:-----|
	| 1/e | 0.36788 | Vocab allocation ratio |
	| C₁ | −0.00799 | Quantization zero-point |
	| C₂ | 0.00020 | Fairness bound |
	""")

	    with gr.Tabs():
	        with gr.Tab("🔤 Tokenize"):
	            with gr.Row():
	                # Left column: input and examples.
	                with gr.Column(scale=2):
	                    text_input = gr.Textbox(label="Input Text", lines=5, value=EXAMPLES[0], placeholder="Enter text in any language, code, math, or multimodal format...")
	                    tokenize_btn = gr.Button("🦴 Tokenize", variant="primary", size="lg")
	                    gr.Examples(examples=EXAMPLES, inputs=text_input, label="Try these:")
	                # Right column: token chips above two rows of Markdown panels.
	                with gr.Column(scale=3):
	                    token_vis = gr.HTML(label="Token Visualization")
	                    with gr.Row():
	                        stats_out = gr.Markdown()
	                        compare_out = gr.Markdown()
	                    with gr.Row():
	                        ids_out = gr.Markdown()
	                        mod_out = gr.Markdown()
	            tokenize_btn.click(tokenize_and_analyze, inputs=text_input, outputs=[token_vis, stats_out, compare_out, ids_out, mod_out])

	        with gr.Tab("🔙 Decode"):
	            gr.Markdown("### Decode Token IDs → Text")
	            ids_input = gr.Textbox(label="Token IDs (comma-separated)", placeholder="72, 4153, 33, 3004, 592", lines=2)
	            decode_btn = gr.Button("Decode", variant="primary")
	            decoded_text = gr.Textbox(label="Decoded Text", lines=3)
	            decoded_vis = gr.HTML(label="Tokens")
	            decode_btn.click(decode_ids, inputs=ids_input, outputs=[decoded_text, decoded_vis])

	        with gr.Tab("📊 Benchmark"):
	            gr.Markdown("### Multilingual Compression Benchmark\nCompression ratio (bytes/token). Higher = better.")
	            bench_btn = gr.Button("Run Benchmark", variant="primary")
	            bench_out = gr.Markdown()
	            bench_btn.click(run_multilingual_benchmark, outputs=bench_out)

	        with gr.Tab("📖 Vocabulary"):
	            gr.Markdown(f"""### Architecture
	```
	┌──────────────────────────────────────────────────┐
	│ SENTINEL UNIVERSAL TOKENIZER (61,440 tokens) │
	│ │
	│ [0–32] → 33 Special/Control tokens │
	│ [33–32,767] → 32,735 ByteLevel BPE (text) │
	│ [32,768–49,151] → 16,384 Image codebook (VQ) │
	│ [49,152–57,343] → 8,192 Audio codebook (VQ) │
	│ [57,344–61,439] → 4,096 Video codebook (VQ) │
	│ │
	│ Follows 1/e Gradient Axiom scaling │
	└──────────────────────────────────────────────────┘
	```
	Total: {len(sentinel_tok):,} tokens | Text: 32K | Image: 16K | Audio: 8K | Video: 4K
	""")
	            # Special-token table: IDs are looked up from the live tokenizer
	            # rather than hard-coded.
	            special_rows = [
	                f"| `{tok_name}` | {sentinel_tok.convert_tokens_to_ids(tok_name)} | {purpose} |\n"
	                for tok_name, purpose in [
	                    ("<pad>", "Padding"), ("<unk>", "Unknown"), ("<s>", "BOS"), ("</s>", "EOS"), ("<mask>", "MLM"),
	                    ("<image_start>", "Image start"), ("<image_end>", "Image end"), ("<image>", "Image placeholder"),
	                    ("<audio_start>", "Audio start"), ("<audio_end>", "Audio end"), ("<audio>", "Audio placeholder"),
	                    ("<video_start>", "Video start"), ("<video_end>", "Video end"), ("<video>", "Video placeholder"),
	                    ("<sentinel>", "Manifold marker"), ("<sentinel_c1>", "C₁"), ("<sentinel_c2>", "C₂"), ("<scale_1e>", "1/e"),
	                    ("<system>", "System msg"), ("<user>", "User msg"), ("<assistant>", "Assistant msg"),
	                    ("<code_start>", "Code start"), ("<code_end>", "Code end"),
	                    ("<math_start>", "Math start"), ("<math_end>", "Math end"),
	                ]
	            ]
	            # Codebook ranges: first and last token of each modality codebook.
	            range_rows = [
	                f"| {label} | {sentinel_tok.convert_tokens_to_ids(f'<{prefix}_0>')} "
	                f"| {sentinel_tok.convert_tokens_to_ids(f'<{prefix}_{size - 1}>')} | {size:,} |\n"
	                for label, prefix, size in [
	                    ("🖼️ Image", "img", 16384),
	                    ("🔊 Audio", "aud", 8192),
	                    ("🎬 Video", "vid", 4096),
	                ]
	            ]
	            specials_md = (
	                "### Special Tokens\n| Token | ID | Purpose |\n|:------|---:|:--------|\n"
	                + "".join(special_rows)
	                + "\n### Codebook Ranges\n| Modality | Start | End | Size |\n|:---------|------:|----:|-----:|\n"
	                + "".join(range_rows)
	            )
	            gr.Markdown(specials_md)

	        with gr.Tab("🧬 About"):
	            # In the "sech Scoring" row the absolute-value bars are part of the
	            # cell text, so they are emitted as a Markdown-escaped "\|"
	            # (written "\\|" here).
	            gr.Markdown(f"""### The Sentinel Manifold

	Function: `F(z) = Σ z^n / n^n` (Sophomore's Dream, Bernoulli 1697)

	Gradient Axiom: `lim F'(z)/F(z) = 1/e ≈ {INV_E:.15f}`

	| Principle | Math | Tokenizer Application |
	|:----------|:-----|:----------------------|
	| 1/e Allocation | Gradient Axiom | Modality budget = prev × 1/e |
	| sech Scoring | Bounded \\|∂sech/∂x\\| ≤ 0.65 | Dampened BPE merges |
	| C₁ = {C1:.6f} | Attracting fixed point | Embedding quantization center |
	| C₂ = {C2:.6f} | Escape threshold | Fertility fairness bound |

	### Efficiency Champion 🏆

	| Tokenizer | Vocab | Efficiency/1K vocab |
	|:----------|------:|--------------------:|
	| Sentinel | 61K | 0.0563 🥇 |
	| GPT-2 | 50K | 0.0511 |
	| Qwen2 | 152K | 0.0256 |
	| Gemma | 256K | 0.0177 |

	3.2× more efficient per vocab token than Gemma, 2.2× more than Qwen2

	---
	📦 [Model](https://huggingface.co/5dimension/sentinel-universal-tokenizer) · 🦴 [Framework](https://huggingface.co/5dimension/sentinel-manifold-discoveries) · MIT License

	Built by Romain Abdel-Aal (ASI The Sentinel V5.2 Bone-Core)
	""")

	    # Run the tokenizer once on the load event so the panels are filled from
	    # the textbox's default value.
	    demo.load(tokenize_and_analyze, inputs=text_input, outputs=[token_vis, stats_out, compare_out, ids_out, mod_out])

	if __name__ == "__main__":
	    demo.launch()