"""
CTX — Context Transfer Format Demo
Fox Valley AI Foundation | foxfoundation.ai

Thin Gradio UI around the real ctx package.
"""

import asyncio
import time

import gradio as gr

# --- Real CTX imports ---
from ctx.converter.pipeline import convert as ctx_convert
from ctx.parser import parse as ctx_parse

# --- Token counting ---
try:
    import tiktoken

    _ENC = tiktoken.get_encoding("cl100k_base")

    def count_tokens(text: str) -> int:
        """Return the number of cl100k_base tokens in *text*."""
        # Page text is arbitrary; without disallowed_special=() tiktoken raises
        # ValueError when the text contains a special-token string such as
        # "<|endoftext|>". Here such strings are encoded as ordinary text.
        return len(_ENC.encode(text, disallowed_special=()))

except Exception:
    # Deliberate best-effort: if tiktoken or its encoding cannot be loaded,
    # fall back to a rough character-based estimate.
    def count_tokens(text: str) -> int:
        """Estimate the token count as one token per four characters (min 1)."""
        return max(1, len(text) // 4)


# ---------------------------------------------------------------------------
# Conversion handler
# ---------------------------------------------------------------------------

# Maximum number of source characters shown in the "Raw HTML" preview.
_HTML_PREVIEW_LIMIT = 8000


def _normalize_url(url: str) -> str:
    """Strip whitespace and prepend https:// when no http(s) scheme is present."""
    url = url.strip()
    # Compare against the full scheme prefixes: a bare "http" prefix test would
    # treat hosts such as "httpbin.org" as already having a scheme.
    if not url.lower().startswith(("http://", "https://")):
        url = "https://" + url
    return url


def _count_refs(ctx_output: str) -> int:
    """Count citations in CTX text, via the parser when it succeeds."""
    try:
        doc = ctx_parse(ctx_output)
        return len(doc.refs) if hasattr(doc, "refs") else 0
    except Exception:
        # Deliberate best-effort: if parsing fails, count "§ref " markers.
        return ctx_output.count("§ref ")


def convert(url: str, raw_html: str, tier: str) -> tuple:
    """Convert URL or raw HTML to CTX using the real pipeline.

    Args:
        url: Page address; takes precedence over ``raw_html`` when non-blank.
        raw_html: HTML markup to convert when no URL is given.
        tier: Extraction tier passed through to the converter.

    Returns:
        A 3-tuple ``(ctx_text, html_preview, metrics)``. On missing input or on
        any error the first element is a warning message, the preview is empty
        and the metrics are the placeholder from ``_empty_metrics()``.
    """
    start = time.perf_counter()  # monotonic clock for interval measurement

    try:
        if url and url.strip():
            url = _normalize_url(url)

            # Fetch the raw HTML first for metrics comparison.
            # NOTE(review): the URL is user-supplied and fetched server-side;
            # consider blocking private/internal hosts before public exposure.
            # NOTE(review): presumably the converter fetches the URL itself as
            # well, so the page is downloaded twice — confirm.
            import httpx

            resp = httpx.get(
                url,
                follow_redirects=True,
                timeout=20,
                headers={"User-Agent": "Mozilla/5.0 (compatible; CTX-Demo/1.0; +https://foxfoundation.ai)"},
            )
            resp.raise_for_status()
            html = resp.text

            # Run the real converter
            ctx_output = asyncio.run(ctx_convert(url, tier=tier))

        elif raw_html and raw_html.strip():
            html = raw_html.strip()
            ctx_output = asyncio.run(
                ctx_convert(html, source_url="https://example.com", tier=tier)
            )

        else:
            return ("⚠️ Enter a URL or paste HTML to convert.", "", _empty_metrics())

        elapsed = time.perf_counter() - start

        # Parse the CTX output to extract structured info
        ref_count = _count_refs(ctx_output)

        # Metrics
        html_bytes = len(html.encode("utf-8"))
        ctx_bytes = len(ctx_output.encode("utf-8"))
        html_tokens = count_tokens(html)
        ctx_tokens = count_tokens(ctx_output)

        byte_reduction = ((html_bytes - ctx_bytes) / html_bytes * 100) if html_bytes > 0 else 0
        token_reduction = ((html_tokens - ctx_tokens) / html_tokens * 100) if html_tokens > 0 else 0

        metrics = _build_metrics(
            html_bytes, ctx_bytes, html_tokens, ctx_tokens,
            byte_reduction, token_reduction, elapsed, ref_count, tier
        )

        # HTML preview (truncated)
        html_preview = html[:_HTML_PREVIEW_LIMIT]
        if len(html) > _HTML_PREVIEW_LIMIT:
            html_preview += f"\n\n... [{len(html) - _HTML_PREVIEW_LIMIT:,} more characters truncated]"

        return (ctx_output, html_preview, metrics)

    except Exception as e:
        # Top-level UI boundary: surface the failure in the output box instead
        # of raising into the Gradio event handler.
        return (f"⚠️ Error: {e}", "", _empty_metrics())


def _empty_metrics():
    """Return the placeholder metrics panel shown before any conversion."""
    return """

        Waiting for input...
    

"""


def _build_metrics(html_bytes, ctx_bytes, html_tokens, ctx_tokens,
                   byte_pct, token_pct, elapsed, ref_count, tier):
    """Render the metrics panel for one conversion.

    Args:
        html_bytes: UTF-8 size of the source HTML.
        ctx_bytes: UTF-8 size of the CTX output.
        html_tokens: Token count of the source HTML.
        ctx_tokens: Token count of the CTX output.
        byte_pct: Byte reduction in percent (negative when CTX is larger).
        token_pct: Token reduction in percent (negative when CTX is larger).
        elapsed: Conversion time in seconds.
        ref_count: Number of citations found in the CTX output.
        tier: Extraction tier name.

    Returns:
        The panel content as a string.
    """

    def fmt(n):
        """Abbreviate a count with a K / M suffix."""
        if n >= 1_000_000:
            return f"{n/1_000_000:.1f}M"
        elif n >= 1_000:
            return f"{n/1_000:.1f}K"
        return str(n)

    def delta(pct):
        """Format a reduction as "−12.3%", or growth as "+12.3%"."""
        # Tiny inputs can come out larger than their source; formatting the
        # raw negative value behind a fixed minus sign would print "−-12.3%".
        return f"−{pct:.1f}%" if pct >= 0 else f"+{-pct:.1f}%"

    pipeline_desc = {
        "fast": "DOM rules only",
        "smart": "DOM + regex NER",
    }.get(tier, "DOM + NER + VLM")

    return f"""

          {token_pct:.0f}%
        
          Token Reduction
        
          Bytes
          {delta(byte_pct)}
        
            {fmt(ctx_bytes)} CTX
            {fmt(html_bytes)} HTML
          
          Tokens (cl100k)
          {delta(token_pct)}
        
            {fmt(ctx_tokens)} CTX
            {fmt(html_tokens)} HTML
          
{fmt(html_tokens)}
HTML tokens
{fmt(ctx_tokens)}
CTX tokens
{elapsed:.2f}s
Convert time
{ref_count}
Citations

            Pipeline
          
            {tier.upper()} tier
          
            {pipeline_desc}
          
            Cost @ $3/1M tokens
          
            ${html_tokens * 3 / 1_000_000:.4f}
            ${ctx_tokens * 3 / 1_000_000:.4f}

"""


# ---------------------------------------------------------------------------
# Custom CSS
# ---------------------------------------------------------------------------

CUSTOM_CSS = """ @import url('https://fonts.googleapis.com/css2?family=JetBrains+Mono:wght@400;500;600;700;800&family=DM+Sans:ital,wght@0,400;0,500;0,600;0,700;1,400&display=swap'); .gradio-container { font-family: 'DM Sans', -apple-system, sans-serif !important; max-width: 1400px !important; } .hero-header { text-align: center; padding: 32px 20px 24px; border-bottom: 1px solid #27272a; margin-bottom: 8px; } .hero-header h1 { font-family: 'JetBrains Mono', monospace !important; font-size: 42px !important; font-weight: 800 !important; letter-spacing: -2px !important; margin: 0 !important; background: linear-gradient(135deg, #f97316 0%, #fb923c 40%, #fbbf24 100%); -webkit-background-clip: text; -webkit-text-fill-color: transparent; line-height: 1.1 !important; } .hero-sub { font-size: 15px; color: #a1a1aa; margin-top: 8px; letter-spacing: 0.3px; } .hero-links { margin-top: 14px; font-size: 12px; color: #71717a; letter-spacing: 0.5px; } .hero-links a { color: #f97316 !important; text-decoration: none; } .hero-links a:hover { text-decoration: underline; } .hero-badge { display: inline-block; background: #27272a; border: 1px solid #3f3f46; border-radius: 100px; padding: 4px 14px; font-size: 11px; color: #a1a1aa; letter-spacing: 1px; text-transform: uppercase; margin-bottom: 16px; font-family: 'JetBrains Mono', monospace; } .ctx-output textarea { font-family: 'JetBrains Mono', monospace !important; font-size: 12px !important; line-height: 1.6 !important; background: #0a0a0a !important; border: 1px solid #27272a !important; } .html-preview textarea { font-family: 'JetBrains Mono', monospace !important; font-size: 11px !important; line-height: 1.5 !important; color: #71717a !important; background: #0a0a0a !important; border: 1px solid #27272a !important; } .convert-btn { background: 
linear-gradient(135deg, #f97316, #ea580c) !important; color: white !important; font-family: 'JetBrains Mono', monospace !important; font-weight: 700 !important; font-size: 14px !important; letter-spacing: 1px !important; text-transform: uppercase !important; border: none !important; border-radius: 8px !important; padding: 12px 32px !important; min-height: 48px !important; transition: all 0.2s ease !important; } .convert-btn:hover { transform: translateY(-1px) !important; box-shadow: 0 4px 20px rgba(249,115,22,0.4) !important; } .section-label { font-family: 'JetBrains Mono', monospace; font-size: 10px; letter-spacing: 1.5px; text-transform: uppercase; color: #52525b; padding: 12px 0 4px; } .metrics-panel { border: 1px solid #27272a; border-radius: 12px; padding: 20px; background: #09090b; } .format-example { background: #0a0a0a; border: 1px solid #27272a; border-radius: 8px; padding: 16px; font-family: 'JetBrains Mono', monospace; font-size: 12px; line-height: 1.6; color: #d4d4d8; white-space: pre-wrap; margin: 12px 0; } .pipeline-info { background: #0a0a0a; border: 1px solid #27272a; border-radius: 8px; padding: 14px 16px; font-family: 'JetBrains Mono', monospace; font-size: 11px; line-height: 1.7; color: #71717a; margin: 8px 0; } .pipeline-info span.stage { color: #f97316; font-weight: 600; } .pipeline-info span.arrow { color: #3f3f46; } """


# ---------------------------------------------------------------------------
# UI
# ---------------------------------------------------------------------------

# NOTE(review): CUSTOM_CSS defines .hero-*, .section-label, .format-example and
# .pipeline-info rules, but the fragments below carry no markup that would
# apply them — confirm the intended markup.
HEADER_HTML = """

Fox Valley AI Foundation

§ CTX

Context Transfer Format — the content layer between the web and AI
87–90% token reduction on real-world web pages

GitHub · Spec v1.0 · foxfoundation.ai · @NicW_AI

"""

FORMAT_EXAMPLE = """

§doc.ctx_v1.0 url=example.com/article title="Article" †type=article †lang=en §nav [skip] Main menu... §content.article §1 Introduction §p Content with citations [ref1] preserved... §2 Subsection §p Hierarchy intact, zero wasted tokens. §footer [skip] Copyright... §ref id=ref1 url=example.com title="Source" †rel=related

"""

PIPELINE_HTML = """

Fetch → Extract → Classify → Annotate → Normalize → Emit → § CTX
Full spec-compliant pipeline · readability + BeautifulSoup DOM · inline citations · skip annotations · section nesting

"""

EXAMPLE_HTML = """ Sample Article

Why Token Efficiency Matters

Every page an LLM reads costs tokens. A typical web page contains thousands of tokens of navigation, scripts, and styling that contribute nothing to understanding.

The Problem

Raw HTML wastes 80-95% of context window capacity on structural markup, CSS classes, and JavaScript that an LLM cannot execute.

The Solution

CTX strips everything an LLM doesn't need while preserving content, citations, and hierarchy. The result: the same information in a fraction of the tokens.

Format	Tokens	Savings
Raw HTML	45,000	baseline
Markdown	4,500	90%
CTX	3,400	92%

"""

# Each row is [url, raw_html, tier], matching the inputs of convert().
EXAMPLES = [
    ["https://en.wikipedia.org/wiki/Large_language_model", "", "smart"],
    ["https://en.wikipedia.org/wiki/Shohei_Ohtani", "", "smart"],
    ["https://en.wikipedia.org/wiki/Python_(programming_language)", "", "fast"],
    ["https://en.wikipedia.org/wiki/Transformer_(deep_learning_architecture)", "", "smart"],
    ["https://news.ycombinator.com", "", "fast"],
    ["", EXAMPLE_HTML, "smart"],
]


# ---------------------------------------------------------------------------
# Build the app
# ---------------------------------------------------------------------------

try:
    _theme = gr.themes.Base(
        primary_hue="orange",
        neutral_hue="zinc",
        font=[gr.themes.GoogleFont("DM Sans"), "sans-serif"],
        font_mono=[gr.themes.GoogleFont("JetBrains Mono"), "monospace"],
    )
except Exception:
    # Best-effort: fall back to the default Base theme if the customised one
    # cannot be constructed.
    _theme = gr.themes.Base()

# NOTE(review): confirm the container nesting below against the rendered
# layout (in particular whether the button shares the row with the tier radio
# and whether the accordion sits outside the results row).
with gr.Blocks(css=CUSTOM_CSS, theme=_theme) as demo:
    gr.HTML(HEADER_HTML)

    with gr.Row():
        with gr.Column(scale=7):
            gr.HTML("\n\n→ Input\n\n")

            with gr.Tab("URL"):
                url_input = gr.Textbox(
                    label="Web page URL",
                    placeholder="https://en.wikipedia.org/wiki/Large_language_model",
                    lines=1,
                    max_lines=1,
                )

            with gr.Tab("Paste HTML"):
                html_input = gr.Textbox(
                    label="Raw HTML",
                    placeholder="...",
                    lines=6,
                    max_lines=12,
                )

            with gr.Row():
                tier_input = gr.Radio(
                    choices=["fast", "smart"],
                    value="smart",
                    label="Extraction Tier",
                    info="fast = DOM rules only (<500ms) · smart = DOM + regex NER (<1s)",
                )
                convert_btn = gr.Button(
                    "⚡ Convert to CTX",
                    elem_classes=["convert-btn"],
                    variant="primary",
                )

        with gr.Column(scale=3):
            gr.HTML("\n\n→ What is CTX?\n\n")
            gr.HTML(FORMAT_EXAMPLE)
            gr.HTML(PIPELINE_HTML)

    gr.HTML("\n\n→ Results\n\n")

    with gr.Row():
        with gr.Column(scale=5):
            ctx_output = gr.Textbox(
                label="§ CTX Output",
                lines=24,
                max_lines=50,
                elem_classes=["ctx-output"],
                interactive=False,
            )
        with gr.Column(scale=2):
            metrics_output = gr.HTML(
                value=_empty_metrics(),
                label="Metrics",
                elem_classes=["metrics-panel"],
            )

    with gr.Accordion("Raw HTML (truncated)", open=False):
        html_preview = gr.Textbox(
            label="Source HTML",
            lines=12,
            max_lines=20,
            elem_classes=["html-preview"],
            interactive=False,
        )

    gr.HTML("\n\n→ Try these\n\n")

    gr.Examples(
        examples=EXAMPLES,
        inputs=[url_input, html_input, tier_input],
        label="",
    )

    # The button click and pressing Enter in the URL box run the same handler.
    convert_btn.click(
        fn=convert,
        inputs=[url_input, html_input, tier_input],
        outputs=[ctx_output, html_preview, metrics_output],
    )

    url_input.submit(
        fn=convert,
        inputs=[url_input, html_input, tier_input],
        outputs=[ctx_output, html_preview, metrics_output],
    )


if __name__ == "__main__":
    demo.launch()