# ctx-demo / app.py
# mtecnic's picture
# Rename app (1).py to app.py
# f958aed verified
"""
CTX — Context Transfer Format Demo
Fox Valley AI Foundation | foxfoundation.ai
Thin Gradio UI around the real ctx package.
"""
import asyncio
import time
import gradio as gr
# --- Real CTX imports ---
from ctx.converter.pipeline import convert as ctx_convert
from ctx.parser import parse as ctx_parse
# --- Token counting ---
# Use tiktoken's cl100k_base encoding when it can be loaded; otherwise define a
# character-based estimate so the app still runs without the tokenizer.
try:
    import tiktoken
    _ENC = tiktoken.get_encoding("cl100k_base")
except Exception:
    def count_tokens(text: str) -> int:
        """Estimate the token count as one token per four characters, minimum 1."""
        estimate = len(text) // 4
        return estimate if estimate > 1 else 1
else:
    def count_tokens(text: str) -> int:
        """Return the number of cl100k_base tokens that ``text`` encodes to."""
        encoded = _ENC.encode(text)
        return len(encoded)
# ---------------------------------------------------------------------------
# Conversion handler
# ---------------------------------------------------------------------------
def convert(url: str, raw_html: str, tier: str) -> tuple:
    """Convert a URL or pasted HTML to CTX and build the demo's three outputs.

    A non-blank ``url`` takes precedence over ``raw_html``.

    Args:
        url: Page address. ``https://`` is prepended when the value does not
            already start with ``http://`` or ``https://``.
        raw_html: HTML markup, used only when ``url`` is blank.
        tier: Extraction tier, forwarded to the CTX converter and shown in the
            metrics panel.

    Returns:
        A ``(ctx_text, html_preview, metrics_html)`` tuple. When both inputs are
        blank, or when any step raises, the first item is a warning message,
        the preview is empty and the metrics are the placeholder panel.
    """
    preview_limit = 8000
    # perf_counter is the clock intended for measuring intervals; unlike
    # time.time() it is not affected by wall-clock adjustments.
    start = time.perf_counter()
    try:
        if url and url.strip():
            url = url.strip()
            # Test for the full scheme prefix: a bare "http" check would treat a
            # host such as "httpbin.org" as already carrying a scheme.
            if not url.lower().startswith(("http://", "https://")):
                url = "https://" + url
            # Fetch the raw HTML first for metrics comparison
            import httpx
            resp = httpx.get(
                url,
                follow_redirects=True,
                timeout=20,
                headers={"User-Agent": "Mozilla/5.0 (compatible; CTX-Demo/1.0; +https://foxfoundation.ai)"},
            )
            resp.raise_for_status()
            html = resp.text
            # Run the real converter
            ctx_output = asyncio.run(ctx_convert(url, tier=tier))
        elif raw_html and raw_html.strip():
            html = raw_html.strip()
            ctx_output = asyncio.run(
                ctx_convert(html, source_url="https://example.com", tier=tier)
            )
        else:
            return ("⚠️ Enter a URL or paste HTML to convert.", "", _empty_metrics())
        elapsed = time.perf_counter() - start

        # Count citations from the parsed document. Only the reference count is
        # read here, so an unrelated attribute problem on the parsed document
        # cannot discard a good count. If parsing fails, fall back to counting
        # "§ref " markers in the raw CTX text.
        try:
            doc = ctx_parse(ctx_output)
            ref_count = len(doc.refs) if hasattr(doc, "refs") else 0
        except Exception:
            ref_count = ctx_output.count("§ref ")

        # Metrics
        html_bytes = len(html.encode("utf-8"))
        ctx_bytes = len(ctx_output.encode("utf-8"))
        html_tokens = count_tokens(html)
        ctx_tokens = count_tokens(ctx_output)
        # Guard the divisions so an empty baseline reports 0 instead of raising.
        byte_reduction = ((html_bytes - ctx_bytes) / html_bytes * 100) if html_bytes > 0 else 0
        token_reduction = ((html_tokens - ctx_tokens) / html_tokens * 100) if html_tokens > 0 else 0
        metrics = _build_metrics(
            html_bytes, ctx_bytes, html_tokens, ctx_tokens,
            byte_reduction, token_reduction, elapsed, ref_count, tier
        )

        # HTML preview (truncated)
        html_preview = html[:preview_limit]
        if len(html) > preview_limit:
            html_preview += f"\n\n... [{len(html) - preview_limit:,} more characters truncated]"
        return (ctx_output, html_preview, metrics)
    except Exception as e:
        # UI boundary: show the failure in the output box instead of raising.
        return (f"⚠️ Error: {e}", "", _empty_metrics())
def _empty_metrics():
return """<div style="text-align:center; padding:48px; color:#888;
font-family:'JetBrains Mono',monospace; font-size:14px;">
Waiting for input...
</div>"""
def _build_metrics(html_bytes, ctx_bytes, html_tokens, ctx_tokens,
byte_pct, token_pct, elapsed, ref_count, tier):
def fmt(n):
if n >= 1_000_000:
return f"{n/1_000_000:.1f}M"
elif n >= 1_000:
return f"{n/1_000:.1f}K"
return str(n)
byte_bar = max(3, 100 - byte_pct)
token_bar = max(3, 100 - token_pct)
return f"""
<div style="font-family: 'JetBrains Mono', 'Fira Code', monospace; padding: 8px 0;">
<div style="text-align:center; margin-bottom:28px;">
<div style="font-size:64px; font-weight:800; letter-spacing:-3px;
background: linear-gradient(135deg, #f97316 0%, #fb923c 50%, #fbbf24 100%);
-webkit-background-clip:text; -webkit-text-fill-color:transparent;
line-height:1;">
{token_pct:.0f}%
</div>
<div style="font-size:13px; color:#a1a1aa; margin-top:4px; letter-spacing:1px; text-transform:uppercase;">
Token Reduction
</div>
</div>
<!-- Byte comparison -->
<div style="margin-bottom:20px;">
<div style="display:flex; justify-content:space-between; font-size:11px; color:#71717a; margin-bottom:6px; text-transform:uppercase; letter-spacing:0.5px;">
<span>Bytes</span>
<span>−{byte_pct:.1f}%</span>
</div>
<div style="position:relative; height:32px; border-radius:6px; overflow:hidden; background:#27272a;">
<div style="position:absolute; top:0; left:0; height:100%; width:100%;
background:#3f3f46; border-radius:6px;"></div>
<div style="position:absolute; top:0; left:0; height:100%; width:{byte_bar}%;
background: linear-gradient(90deg, #f97316, #fb923c);
border-radius:6px; transition: width 0.8s ease;"></div>
<div style="position:absolute; top:0; left:0; height:100%; width:100%;
display:flex; align-items:center; justify-content:space-between; padding:0 10px;">
<span style="font-size:11px; color:white; font-weight:600; text-shadow:0 1px 2px rgba(0,0,0,0.5);">{fmt(ctx_bytes)} CTX</span>
<span style="font-size:11px; color:#a1a1aa;">{fmt(html_bytes)} HTML</span>
</div>
</div>
</div>
<!-- Token comparison -->
<div style="margin-bottom:20px;">
<div style="display:flex; justify-content:space-between; font-size:11px; color:#71717a; margin-bottom:6px; text-transform:uppercase; letter-spacing:0.5px;">
<span>Tokens (cl100k)</span>
<span>−{token_pct:.1f}%</span>
</div>
<div style="position:relative; height:32px; border-radius:6px; overflow:hidden; background:#27272a;">
<div style="position:absolute; top:0; left:0; height:100%; width:100%;
background:#3f3f46; border-radius:6px;"></div>
<div style="position:absolute; top:0; left:0; height:100%; width:{token_bar}%;
background: linear-gradient(90deg, #3b82f6, #60a5fa);
border-radius:6px; transition: width 0.8s ease;"></div>
<div style="position:absolute; top:0; left:0; height:100%; width:100%;
display:flex; align-items:center; justify-content:space-between; padding:0 10px;">
<span style="font-size:11px; color:white; font-weight:600; text-shadow:0 1px 2px rgba(0,0,0,0.5);">{fmt(ctx_tokens)} CTX</span>
<span style="font-size:11px; color:#a1a1aa;">{fmt(html_tokens)} HTML</span>
</div>
</div>
</div>
<!-- Stats grid -->
<div style="display:grid; grid-template-columns:1fr 1fr; gap:12px; margin-top:24px;">
<div style="background:#18181b; border:1px solid #27272a; border-radius:8px; padding:14px; text-align:center;">
<div style="font-size:20px; font-weight:700; color:#f97316;">{fmt(html_tokens)}</div>
<div style="font-size:10px; color:#71717a; margin-top:2px; text-transform:uppercase; letter-spacing:0.5px;">HTML tokens</div>
</div>
<div style="background:#18181b; border:1px solid #27272a; border-radius:8px; padding:14px; text-align:center;">
<div style="font-size:20px; font-weight:700; color:#3b82f6;">{fmt(ctx_tokens)}</div>
<div style="font-size:10px; color:#71717a; margin-top:2px; text-transform:uppercase; letter-spacing:0.5px;">CTX tokens</div>
</div>
<div style="background:#18181b; border:1px solid #27272a; border-radius:8px; padding:14px; text-align:center;">
<div style="font-size:20px; font-weight:700; color:#22c55e;">{elapsed:.2f}s</div>
<div style="font-size:10px; color:#71717a; margin-top:2px; text-transform:uppercase; letter-spacing:0.5px;">Convert time</div>
</div>
<div style="background:#18181b; border:1px solid #27272a; border-radius:8px; padding:14px; text-align:center;">
<div style="font-size:20px; font-weight:700; color:#a78bfa;">{ref_count}</div>
<div style="font-size:10px; color:#71717a; margin-top:2px; text-transform:uppercase; letter-spacing:0.5px;">Citations</div>
</div>
</div>
<!-- Tier + Cost -->
<div style="margin-top:16px; display:grid; grid-template-columns:1fr 1fr; gap:12px;">
<div style="background:#18181b; border:1px solid #27272a; border-radius:8px; padding:14px;">
<div style="font-size:10px; color:#71717a; text-transform:uppercase; letter-spacing:1px; margin-bottom:6px;">
Pipeline
</div>
<div style="font-size:14px; color:#fbbf24; font-weight:600;">
{tier.upper()} tier
</div>
<div style="font-size:10px; color:#52525b; margin-top:2px;">
{"DOM rules only" if tier == "fast" else "DOM + regex NER" if tier == "smart" else "DOM + NER + VLM"}
</div>
</div>
<div style="background:#18181b; border:1px solid #27272a; border-radius:8px; padding:14px;">
<div style="font-size:10px; color:#71717a; text-transform:uppercase; letter-spacing:1px; margin-bottom:6px;">
Cost @ $3/1M tokens
</div>
<div style="display:flex; justify-content:space-between; align-items:baseline;">
<span style="font-size:12px; color:#ef4444; text-decoration:line-through;">${html_tokens * 3 / 1_000_000:.4f}</span>
<span style="font-size:14px; color:#22c55e; font-weight:600;">${ctx_tokens * 3 / 1_000_000:.4f}</span>
</div>
</div>
</div>
</div>
"""
# ---------------------------------------------------------------------------
# Custom CSS
# ---------------------------------------------------------------------------
# Stylesheet passed to gr.Blocks(css=...). It loads the JetBrains Mono and
# DM Sans web fonts and styles the classes this file attaches via elem_classes
# (convert-btn, ctx-output, html-preview, metrics-panel) or uses inside its
# HTML snippets (hero-*, section-label, format-example, pipeline-info).
CUSTOM_CSS = """
@import url('https://fonts.googleapis.com/css2?family=JetBrains+Mono:wght@400;500;600;700;800&family=DM+Sans:ital,wght@0,400;0,500;0,600;0,700;1,400&display=swap');
.gradio-container {
font-family: 'DM Sans', -apple-system, sans-serif !important;
max-width: 1400px !important;
}
.hero-header {
text-align: center;
padding: 32px 20px 24px;
border-bottom: 1px solid #27272a;
margin-bottom: 8px;
}
.hero-header h1 {
font-family: 'JetBrains Mono', monospace !important;
font-size: 42px !important;
font-weight: 800 !important;
letter-spacing: -2px !important;
margin: 0 !important;
background: linear-gradient(135deg, #f97316 0%, #fb923c 40%, #fbbf24 100%);
-webkit-background-clip: text;
-webkit-text-fill-color: transparent;
line-height: 1.1 !important;
}
.hero-sub {
font-size: 15px;
color: #a1a1aa;
margin-top: 8px;
letter-spacing: 0.3px;
}
.hero-links {
margin-top: 14px;
font-size: 12px;
color: #71717a;
letter-spacing: 0.5px;
}
.hero-links a {
color: #f97316 !important;
text-decoration: none;
}
.hero-links a:hover {
text-decoration: underline;
}
.hero-badge {
display: inline-block;
background: #27272a;
border: 1px solid #3f3f46;
border-radius: 100px;
padding: 4px 14px;
font-size: 11px;
color: #a1a1aa;
letter-spacing: 1px;
text-transform: uppercase;
margin-bottom: 16px;
font-family: 'JetBrains Mono', monospace;
}
.ctx-output textarea {
font-family: 'JetBrains Mono', monospace !important;
font-size: 12px !important;
line-height: 1.6 !important;
background: #0a0a0a !important;
border: 1px solid #27272a !important;
}
.html-preview textarea {
font-family: 'JetBrains Mono', monospace !important;
font-size: 11px !important;
line-height: 1.5 !important;
color: #71717a !important;
background: #0a0a0a !important;
border: 1px solid #27272a !important;
}
.convert-btn {
background: linear-gradient(135deg, #f97316, #ea580c) !important;
color: white !important;
font-family: 'JetBrains Mono', monospace !important;
font-weight: 700 !important;
font-size: 14px !important;
letter-spacing: 1px !important;
text-transform: uppercase !important;
border: none !important;
border-radius: 8px !important;
padding: 12px 32px !important;
min-height: 48px !important;
transition: all 0.2s ease !important;
}
.convert-btn:hover {
transform: translateY(-1px) !important;
box-shadow: 0 4px 20px rgba(249,115,22,0.4) !important;
}
.section-label {
font-family: 'JetBrains Mono', monospace;
font-size: 10px;
letter-spacing: 1.5px;
text-transform: uppercase;
color: #52525b;
padding: 12px 0 4px;
}
.metrics-panel {
border: 1px solid #27272a;
border-radius: 12px;
padding: 20px;
background: #09090b;
}
.format-example {
background: #0a0a0a;
border: 1px solid #27272a;
border-radius: 8px;
padding: 16px;
font-family: 'JetBrains Mono', monospace;
font-size: 12px;
line-height: 1.6;
color: #d4d4d8;
white-space: pre-wrap;
margin: 12px 0;
}
.pipeline-info {
background: #0a0a0a;
border: 1px solid #27272a;
border-radius: 8px;
padding: 14px 16px;
font-family: 'JetBrains Mono', monospace;
font-size: 11px;
line-height: 1.7;
color: #71717a;
margin: 8px 0;
}
.pipeline-info span.stage {
color: #f97316;
font-weight: 600;
}
.pipeline-info span.arrow {
color: #3f3f46;
}
"""
# ---------------------------------------------------------------------------
# UI
# ---------------------------------------------------------------------------
# Hero banner rendered as the first element of the page: organisation badge,
# title, tagline and external links. Styled by the .hero-* rules in CUSTOM_CSS.
HEADER_HTML = """
<div class="hero-header">
<div class="hero-badge">Fox Valley AI Foundation</div>
<h1>§ CTX</h1>
<div class="hero-sub">
Context Transfer Format — the content layer between the web and AI<br/>
<strong style="color:#d4d4d8;">87–90% token reduction</strong> on real-world web pages
</div>
<div class="hero-links">
<a href="https://github.com/mtecnic/ctx" target="_blank">GitHub</a> ·
<a href="https://github.com/mtecnic/ctx/blob/main/specification.md" target="_blank">Spec v1.0</a> ·
<a href="https://foxfoundation.ai" target="_blank">foxfoundation.ai</a> ·
<a href="https://x.com/NicW_AI" target="_blank">@NicW_AI</a>
</div>
</div>
"""
# Hand-coloured sample of a CTX document, shown in the "What is CTX?" column.
# The .format-example rule uses white-space: pre-wrap, so the line breaks inside
# this string are part of what is displayed.
FORMAT_EXAMPLE = """<div class="format-example"><span style="color:#f97316;">§doc.ctx_v1.0</span> url=example.com/article title="Article" <span style="color:#60a5fa;">†type</span>=article <span style="color:#60a5fa;">†lang</span>=en
<span style="color:#52525b;">§nav [skip]
Main menu...</span>
<span style="color:#f97316;">§content.article</span>
<span style="color:#f97316;">§1</span> Introduction
<span style="color:#f97316;">§p</span> Content with citations <span style="color:#22c55e;">[ref1]</span> preserved...
<span style="color:#f97316;">§2</span> Subsection
<span style="color:#f97316;">§p</span> Hierarchy intact, zero wasted tokens.
<span style="color:#52525b;">§footer [skip]
Copyright...</span>
<span style="color:#22c55e;">§ref</span> id=ref1 url=example.com title="Source" <span style="color:#60a5fa;">†rel</span>=related</div>"""
# Static strip listing the pipeline stage names, shown beneath the format
# example. Purely descriptive markup; nothing in this file reads it back.
PIPELINE_HTML = """<div class="pipeline-info">
<span class="stage">Fetch</span> <span class="arrow">→</span>
<span class="stage">Extract</span> <span class="arrow">→</span>
<span class="stage">Classify</span> <span class="arrow">→</span>
<span class="stage">Annotate</span> <span class="arrow">→</span>
<span class="stage">Normalize</span> <span class="arrow">→</span>
<span class="stage">Emit</span> <span class="arrow">→</span>
<span style="color:#fbbf24; font-weight:700;">§ CTX</span>
<br/><span style="color:#52525b;">Full spec-compliant pipeline · readability + BeautifulSoup DOM · inline citations · skip annotations · section nesting</span>
</div>"""
# Small self-contained page (nav, article with links and a table, sidebar,
# footer) used as the pasted-HTML entry in EXAMPLES.
EXAMPLE_HTML = """<html lang="en">
<head><title>Sample Article</title></head>
<body>
<nav><a href="/">Home</a> | <a href="/blog">Blog</a> | <a href="/about">About</a></nav>
<article>
<h1>Why Token Efficiency Matters</h1>
<p>Every page an LLM reads costs tokens. A typical web page contains
<a href="https://example.com/html-bloat">thousands of tokens</a> of
navigation, scripts, and styling that contribute nothing to understanding.</p>
<h2>The Problem</h2>
<p>Raw HTML wastes 80-95% of context window capacity on structural markup,
<a href="https://example.com/css">CSS classes</a>, and
<a href="https://example.com/js">JavaScript</a> that an LLM cannot execute.</p>
<h2>The Solution</h2>
<p>CTX strips everything an LLM doesn't need while preserving content, citations,
and hierarchy. The result: the same information in a fraction of the tokens.</p>
<table>
<tr><th>Format</th><th>Tokens</th><th>Savings</th></tr>
<tr><td>Raw HTML</td><td>45,000</td><td>baseline</td></tr>
<tr><td>Markdown</td><td>4,500</td><td>90%</td></tr>
<tr><td>CTX</td><td>3,400</td><td>92%</td></tr>
</table>
</article>
<aside class="sidebar"><h3>Related Posts</h3><ul><li>Post 1</li><li>Post 2</li></ul></aside>
<footer><p>© 2026 Example Inc. All rights reserved. Privacy Policy | Terms</p></footer>
</body>
</html>"""
# Rows for gr.Examples. Each row is [url, raw_html, tier], in the same order as
# the inputs list [url_input, html_input, tier_input]. The last row leaves the
# URL empty so the pasted-HTML path is exercised.
EXAMPLES = [
    ["https://en.wikipedia.org/wiki/Large_language_model", "", "smart"],
    ["https://en.wikipedia.org/wiki/Shohei_Ohtani", "", "smart"],
    ["https://en.wikipedia.org/wiki/Python_(programming_language)", "", "fast"],
    ["https://en.wikipedia.org/wiki/Transformer_(deep_learning_architecture)", "", "smart"],
    ["https://news.ycombinator.com", "", "fast"],
    ["", EXAMPLE_HTML, "smart"],
]
# ---------------------------------------------------------------------------
# Build the app
# ---------------------------------------------------------------------------
# Build the customised theme (orange primary hue, zinc neutrals, DM Sans and
# JetBrains Mono fonts). If constructing it raises for any reason, fall back
# to the default Base theme so the app can still start.
try:
    _theme = gr.themes.Base(
        primary_hue="orange",
        neutral_hue="zinc",
        font=[gr.themes.GoogleFont("DM Sans"), "sans-serif"],
        font_mono=[gr.themes.GoogleFont("JetBrains Mono"), "monospace"],
    )
except Exception:
    _theme = gr.themes.Base()
# Page layout and event wiring for the demo.
# NOTE(review): the nesting of the `with` blocks below was inferred from a
# listing that had lost its indentation — confirm it against the intended layout.
with gr.Blocks(css=CUSTOM_CSS, theme=_theme) as demo:
    gr.HTML(HEADER_HTML)
    # Top row: input controls next to the static "What is CTX?" explainer.
    with gr.Row():
        with gr.Column(scale=7):
            gr.HTML('<div class="section-label">→ Input</div>')
            with gr.Tab("URL"):
                url_input = gr.Textbox(
                    label="Web page URL",
                    placeholder="https://en.wikipedia.org/wiki/Large_language_model",
                    lines=1,
                    max_lines=1,
                )
            with gr.Tab("Paste HTML"):
                html_input = gr.Textbox(
                    label="Raw HTML",
                    placeholder="<html>...</html>",
                    lines=6,
                    max_lines=12,
                )
            with gr.Row():
                tier_input = gr.Radio(
                    choices=["fast", "smart"],
                    value="smart",
                    label="Extraction Tier",
                    info="fast = DOM rules only (<500ms) · smart = DOM + regex NER (<1s)",
                )
                convert_btn = gr.Button(
                    "⚡ Convert to CTX",
                    elem_classes=["convert-btn"],
                    variant="primary",
                )
        with gr.Column(scale=3):
            gr.HTML('<div class="section-label">→ What is CTX?</div>')
            gr.HTML(FORMAT_EXAMPLE)
            gr.HTML(PIPELINE_HTML)
    gr.HTML('<div class="section-label" style="margin-top:8px;">→ Results</div>')
    # Results row: read-only CTX text beside the metrics panel, which starts out
    # showing the placeholder markup.
    with gr.Row():
        with gr.Column(scale=5):
            ctx_output = gr.Textbox(
                label="§ CTX Output",
                lines=24,
                max_lines=50,
                elem_classes=["ctx-output"],
                interactive=False,
            )
        with gr.Column(scale=2):
            metrics_output = gr.HTML(
                value=_empty_metrics(),
                label="Metrics",
                elem_classes=["metrics-panel"],
            )
    # Collapsed by default; receives the truncated source HTML from convert().
    with gr.Accordion("Raw HTML (truncated)", open=False):
        html_preview = gr.Textbox(
            label="Source HTML",
            lines=12,
            max_lines=20,
            elem_classes=["html-preview"],
            interactive=False,
        )
    gr.HTML('<div class="section-label" style="margin-top:4px;">→ Try these</div>')
    # No fn is given here, so the examples are only wired to the three inputs.
    gr.Examples(
        examples=EXAMPLES,
        inputs=[url_input, html_input, tier_input],
        label="",
    )
    # The button click and the URL box's submit event run the same handler with
    # the same inputs and outputs.
    convert_btn.click(
        fn=convert,
        inputs=[url_input, html_input, tier_input],
        outputs=[ctx_output, html_preview, metrics_output],
    )
    url_input.submit(
        fn=convert,
        inputs=[url_input, html_input, tier_input],
        outputs=[ctx_output, html_preview, metrics_output],
    )
# Launch the Gradio app only when this file is run as a script, not on import.
if __name__ == "__main__":
    demo.launch()