"""
CTX — Context Transfer Format Demo
Fox Valley AI Foundation | foxfoundation.ai
Thin Gradio UI around the real ctx package.
"""
import asyncio
import time
import gradio as gr
# --- Real CTX imports ---
from ctx.converter.pipeline import convert as ctx_convert
from ctx.parser import parse as ctx_parse
# --- Token counting ---
try:
    import tiktoken

    _ENC = tiktoken.get_encoding("cl100k_base")

    def count_tokens(text: str) -> int:
        """Return the number of cl100k_base tokens in *text*."""
        # Web pages can literally contain special-token spellings such as
        # "<|endoftext|>". With tiktoken's default settings encode() raises
        # ValueError on those; disallowed_special=() encodes them as plain text.
        return len(_ENC.encode(text, disallowed_special=()))

except Exception:
    # tiktoken is not installed, or its encoding could not be loaded:
    # fall back to a rough character-based estimate.

    def count_tokens(text: str) -> int:
        """Estimate the token count as one token per four characters (minimum 1)."""
        return max(1, len(text) // 4)
# ---------------------------------------------------------------------------
# Conversion handler
# ---------------------------------------------------------------------------
def convert(url: str, raw_html: str, tier: str) -> tuple:
    """Convert a URL or pasted HTML to CTX and compute size/token metrics.

    Args:
        url: Page address. Takes precedence over ``raw_html`` when non-blank.
            ``https://`` is prepended when the value does not already start
            with ``http://`` or ``https://``.
        raw_html: HTML markup; used only when ``url`` is blank.
        tier: Extraction tier, passed through to ``ctx_convert`` and shown in
            the metrics panel.

    Returns:
        A 3-tuple ``(ctx_text, html_preview, metrics)``. On blank input or on
        any failure, the first element is a warning message, the preview is
        empty and the metrics are the ``_empty_metrics()`` placeholder.
    """
    # perf_counter is monotonic, so the elapsed time cannot go negative or jump
    # if the wall clock is adjusted mid-conversion.
    start = time.perf_counter()
    try:
        if url and url.strip():
            url = url.strip()
            # Check for a real scheme, not just the letters "http": a bare host
            # such as "httpbin.org" must still get "https://" prepended.
            if not url.lower().startswith(("http://", "https://")):
                url = "https://" + url
            # Fetch the raw HTML first for metrics comparison.
            # NOTE(review): this requests a user-supplied URL from the server
            # side; consider restricting private/internal addresses if the
            # demo is publicly hosted.
            import httpx
            resp = httpx.get(
                url,
                follow_redirects=True,
                timeout=20,
                headers={"User-Agent": "Mozilla/5.0 (compatible; CTX-Demo/1.0; +https://foxfoundation.ai)"},
            )
            resp.raise_for_status()
            html = resp.text
            # Run the real converter.
            # NOTE(review): the converter receives the URL, not the HTML
            # fetched above, so the page is presumably fetched twice — confirm.
            ctx_output = asyncio.run(ctx_convert(url, tier=tier))
        elif raw_html and raw_html.strip():
            html = raw_html.strip()
            ctx_output = asyncio.run(
                ctx_convert(html, source_url="https://example.com", tier=tier)
            )
        else:
            return ("⚠️ Enter a URL or paste HTML to convert.", "", _empty_metrics())
        elapsed = time.perf_counter() - start

        # Parse the CTX output to count references.
        try:
            doc = ctx_parse(ctx_output)
            ref_count = len(doc.refs) if hasattr(doc, "refs") else 0
        except Exception:
            # The parser rejected the output: count raw "§ref " markers instead.
            ref_count = ctx_output.count("§ref ")

        # Metrics
        html_bytes = len(html.encode("utf-8"))
        ctx_bytes = len(ctx_output.encode("utf-8"))
        html_tokens = count_tokens(html)
        ctx_tokens = count_tokens(ctx_output)
        # Guard the divisions: a fetched page body may be empty.
        byte_reduction = ((html_bytes - ctx_bytes) / html_bytes * 100) if html_bytes > 0 else 0
        token_reduction = ((html_tokens - ctx_tokens) / html_tokens * 100) if html_tokens > 0 else 0
        metrics = _build_metrics(
            html_bytes, ctx_bytes, html_tokens, ctx_tokens,
            byte_reduction, token_reduction, elapsed, ref_count, tier
        )

        # HTML preview (truncated so the textbox stays responsive).
        html_preview = html[:8000]
        if len(html) > 8000:
            html_preview += f"\n\n... [{len(html) - 8000:,} more characters truncated]"
        return (ctx_output, html_preview, metrics)
    except Exception as e:
        # UI boundary: report the failure in the output box instead of raising.
        return (f"⚠️ Error: {e}", "", _empty_metrics())
def _empty_metrics():
return """
Waiting for input...
"""
def _build_metrics(html_bytes, ctx_bytes, html_tokens, ctx_tokens,
byte_pct, token_pct, elapsed, ref_count, tier):
def fmt(n):
if n >= 1_000_000:
return f"{n/1_000_000:.1f}M"
elif n >= 1_000:
return f"{n/1_000:.1f}K"
return str(n)
byte_bar = max(3, 100 - byte_pct)
token_bar = max(3, 100 - token_pct)
return f"""
{token_pct:.0f}%
Token Reduction
Bytes
−{byte_pct:.1f}%
{fmt(ctx_bytes)} CTX
{fmt(html_bytes)} HTML
Tokens (cl100k)
−{token_pct:.1f}%
{fmt(ctx_tokens)} CTX
{fmt(html_tokens)} HTML
{fmt(html_tokens)}
HTML tokens
{fmt(ctx_tokens)}
CTX tokens
{elapsed:.2f}s
Convert time
Pipeline
{tier.upper()} tier
{"DOM rules only" if tier == "fast" else "DOM + regex NER" if tier == "smart" else "DOM + NER + VLM"}
Cost @ $3/1M tokens
${html_tokens * 3 / 1_000_000:.4f}
${ctx_tokens * 3 / 1_000_000:.4f}
"""
# ---------------------------------------------------------------------------
# Custom CSS
# ---------------------------------------------------------------------------
# Stylesheet handed to gr.Blocks(css=...). The first rule imports the
# JetBrains Mono and DM Sans web fonts from Google Fonts. The .ctx-output,
# .html-preview, .convert-btn and .metrics-panel selectors are attached to
# components through their elem_classes argument.
CUSTOM_CSS = """
@import url('https://fonts.googleapis.com/css2?family=JetBrains+Mono:wght@400;500;600;700;800&family=DM+Sans:ital,wght@0,400;0,500;0,600;0,700;1,400&display=swap');
.gradio-container {
font-family: 'DM Sans', -apple-system, sans-serif !important;
max-width: 1400px !important;
}
.hero-header {
text-align: center;
padding: 32px 20px 24px;
border-bottom: 1px solid #27272a;
margin-bottom: 8px;
}
.hero-header h1 {
font-family: 'JetBrains Mono', monospace !important;
font-size: 42px !important;
font-weight: 800 !important;
letter-spacing: -2px !important;
margin: 0 !important;
background: linear-gradient(135deg, #f97316 0%, #fb923c 40%, #fbbf24 100%);
-webkit-background-clip: text;
-webkit-text-fill-color: transparent;
line-height: 1.1 !important;
}
.hero-sub {
font-size: 15px;
color: #a1a1aa;
margin-top: 8px;
letter-spacing: 0.3px;
}
.hero-links {
margin-top: 14px;
font-size: 12px;
color: #71717a;
letter-spacing: 0.5px;
}
.hero-links a {
color: #f97316 !important;
text-decoration: none;
}
.hero-links a:hover {
text-decoration: underline;
}
.hero-badge {
display: inline-block;
background: #27272a;
border: 1px solid #3f3f46;
border-radius: 100px;
padding: 4px 14px;
font-size: 11px;
color: #a1a1aa;
letter-spacing: 1px;
text-transform: uppercase;
margin-bottom: 16px;
font-family: 'JetBrains Mono', monospace;
}
.ctx-output textarea {
font-family: 'JetBrains Mono', monospace !important;
font-size: 12px !important;
line-height: 1.6 !important;
background: #0a0a0a !important;
border: 1px solid #27272a !important;
}
.html-preview textarea {
font-family: 'JetBrains Mono', monospace !important;
font-size: 11px !important;
line-height: 1.5 !important;
color: #71717a !important;
background: #0a0a0a !important;
border: 1px solid #27272a !important;
}
.convert-btn {
background: linear-gradient(135deg, #f97316, #ea580c) !important;
color: white !important;
font-family: 'JetBrains Mono', monospace !important;
font-weight: 700 !important;
font-size: 14px !important;
letter-spacing: 1px !important;
text-transform: uppercase !important;
border: none !important;
border-radius: 8px !important;
padding: 12px 32px !important;
min-height: 48px !important;
transition: all 0.2s ease !important;
}
.convert-btn:hover {
transform: translateY(-1px) !important;
box-shadow: 0 4px 20px rgba(249,115,22,0.4) !important;
}
.section-label {
font-family: 'JetBrains Mono', monospace;
font-size: 10px;
letter-spacing: 1.5px;
text-transform: uppercase;
color: #52525b;
padding: 12px 0 4px;
}
.metrics-panel {
border: 1px solid #27272a;
border-radius: 12px;
padding: 20px;
background: #09090b;
}
.format-example {
background: #0a0a0a;
border: 1px solid #27272a;
border-radius: 8px;
padding: 16px;
font-family: 'JetBrains Mono', monospace;
font-size: 12px;
line-height: 1.6;
color: #d4d4d8;
white-space: pre-wrap;
margin: 12px 0;
}
.pipeline-info {
background: #0a0a0a;
border: 1px solid #27272a;
border-radius: 8px;
padding: 14px 16px;
font-family: 'JetBrains Mono', monospace;
font-size: 11px;
line-height: 1.7;
color: #71717a;
margin: 8px 0;
}
.pipeline-info span.stage {
color: #f97316;
font-weight: 600;
}
.pipeline-info span.arrow {
color: #3f3f46;
}
"""
# ---------------------------------------------------------------------------
# UI
# ---------------------------------------------------------------------------
# Markup rendered at the very top of the page via gr.HTML(HEADER_HTML).
# As written it holds only a newline.
# NOTE(review): CUSTOM_CSS defines .hero-header/.hero-sub/.hero-links/.hero-badge
# rules, which suggests header markup was intended here — confirm.
HEADER_HTML = """
"""
# Sample CTX document shown in the right-hand column through
# gr.HTML(FORMAT_EXAMPLE). Lines starting with "§" are CTX directives.
# NOTE(review): "†type =article" / "†lang =en" / "†rel =related" carry a space
# before "=", unlike url= and title= — confirm this matches the CTX syntax.
FORMAT_EXAMPLE = """§doc.ctx_v1.0 url=example.com/article title="Article" †type =article †lang =en
§nav [skip]
Main menu...
§content.article
§1 Introduction
§p Content with citations [ref1] preserved...
§2 Subsection
§p Hierarchy intact, zero wasted tokens.
§footer [skip]
Copyright...
§ref id=ref1 url=example.com title="Source" †rel =related
"""
# Summary of the conversion stages (Fetch through Emit), rendered with
# gr.HTML(PIPELINE_HTML) directly below the format example.
PIPELINE_HTML = """
Fetch →
Extract →
Classify →
Annotate →
Normalize →
Emit →
§ CTX
Full spec-compliant pipeline · readability + BeautifulSoup DOM · inline citations · skip annotations · section nesting
"""
# Sample article used as the "Paste HTML" entry in EXAMPLES.
# NOTE(review): as written this text contains no HTML tags — confirm the
# markup was not lost, since it is fed to the converter as raw HTML.
EXAMPLE_HTML = """
Sample Article
Home | Blog | About
Why Token Efficiency Matters
Every page an LLM reads costs tokens. A typical web page contains
thousands of tokens of
navigation, scripts, and styling that contribute nothing to understanding.
The Problem
Raw HTML wastes 80-95% of context window capacity on structural markup,
CSS classes , and
JavaScript that an LLM cannot execute.
The Solution
CTX strips everything an LLM doesn't need while preserving content, citations,
and hierarchy. The result: the same information in a fraction of the tokens.
Format Tokens Savings
Raw HTML 45,000 baseline
Markdown 4,500 90%
CTX 3,400 92%
"""
# Rows offered by gr.Examples. Each row is [url, raw_html, tier], in the same
# order as the inputs list [url_input, html_input, tier_input]. The last row
# leaves the URL blank so the pasted sample article is converted instead.
EXAMPLES = [
["https://en.wikipedia.org/wiki/Large_language_model", "", "smart"],
["https://en.wikipedia.org/wiki/Shohei_Ohtani", "", "smart"],
["https://en.wikipedia.org/wiki/Python_(programming_language)", "", "fast"],
["https://en.wikipedia.org/wiki/Transformer_(deep_learning_architecture)", "", "smart"],
["https://news.ycombinator.com", "", "fast"],
["", EXAMPLE_HTML, "smart"],
]
# ---------------------------------------------------------------------------
# Build the app
# ---------------------------------------------------------------------------
# Build the branded theme (orange accent on zinc neutrals, DM Sans body text,
# JetBrains Mono for monospace). If that construction fails for any reason,
# fall back to an unconfigured Base theme so the app still starts.
try:
    _theme = gr.themes.Base(
        font=[gr.themes.GoogleFont("DM Sans"), "sans-serif"],
        font_mono=[gr.themes.GoogleFont("JetBrains Mono"), "monospace"],
        primary_hue="orange",
        neutral_hue="zinc",
    )
except Exception:
    _theme = gr.themes.Base()
# Page layout: header, an input row (URL / pasted-HTML tabs, tier selector and
# convert button beside an explainer column), a results row (CTX output beside
# the metrics panel), a collapsible raw-HTML preview, and clickable examples.
# NOTE(review): CUSTOM_CSS defines a .section-label rule that looks intended
# for the "→ ..." headings below — confirm whether they should carry it.
with gr.Blocks(css=CUSTOM_CSS, theme=_theme) as demo:
    gr.HTML(HEADER_HTML)

    with gr.Row():
        with gr.Column(scale=7):
            gr.HTML("→ Input")
            with gr.Tab("URL"):
                url_input = gr.Textbox(
                    label="Web page URL",
                    placeholder="https://en.wikipedia.org/wiki/Large_language_model",
                    lines=1,
                    max_lines=1,
                )
            with gr.Tab("Paste HTML"):
                html_input = gr.Textbox(
                    label="Raw HTML",
                    placeholder="...",
                    lines=6,
                    max_lines=12,
                )
            with gr.Row():
                tier_input = gr.Radio(
                    choices=["fast", "smart"],
                    value="smart",
                    label="Extraction Tier",
                    info="fast = DOM rules only (<500ms) · smart = DOM + regex NER (<1s)",
                )
                convert_btn = gr.Button(
                    "⚡ Convert to CTX",
                    elem_classes=["convert-btn"],
                    variant="primary",
                )
        with gr.Column(scale=3):
            gr.HTML("→ What is CTX?")
            gr.HTML(FORMAT_EXAMPLE)
            gr.HTML(PIPELINE_HTML)

    gr.HTML("→ Results")
    with gr.Row():
        with gr.Column(scale=5):
            ctx_output = gr.Textbox(
                label="§ CTX Output",
                lines=24,
                max_lines=50,
                elem_classes=["ctx-output"],
                interactive=False,
            )
        with gr.Column(scale=2):
            metrics_output = gr.HTML(
                value=_empty_metrics(),
                label="Metrics",
                elem_classes=["metrics-panel"],
            )

    with gr.Accordion("Raw HTML (truncated)", open=False):
        html_preview = gr.Textbox(
            label="Source HTML",
            lines=12,
            max_lines=20,
            elem_classes=["html-preview"],
            interactive=False,
        )

    gr.HTML("→ Try these")
    gr.Examples(
        examples=EXAMPLES,
        inputs=[url_input, html_input, tier_input],
        label="",
    )

    # Clicking the button and pressing Enter in the URL box both run the same
    # conversion and fill the same three outputs.
    convert_btn.click(
        fn=convert,
        inputs=[url_input, html_input, tier_input],
        outputs=[ctx_output, html_preview, metrics_output],
    )
    url_input.submit(
        fn=convert,
        inputs=[url_input, html_input, tier_input],
        outputs=[ctx_output, html_preview, metrics_output],
    )

if __name__ == "__main__":
    demo.launch()