""" CTX: Cross-Session Memory for Claude Code HuggingFace Space — Interactive Demo (v1.0) Shows G1 (decision recall) and G2 (file retrieval) before/after comparison. Fully self-contained — no backend, no vault.db, no vec-daemon required. """ import gradio as gr # ── G1 scenarios: simulated before/after decision recall ────────────────────── G1_SCENARIOS = { "Why did we switch from TF-IDF to BM25?": { "without": ( "I don't have context about previous decisions in this project. " "Could you tell me more about the retrieval system you're working on?" ), "injected": [ { "date": "2026-03-27", "decision": "Switched retrieval scorer from TF-IDF to BM25", "reason": "keyword R@3 improved 0.379 → 0.655 (+27.3pp). " "TF-IDF penalized domain terms via low IDF on small corpora.", }, { "date": "2026-03-27", "decision": "BM25Okapi (IDF-weighted) also underperformed — routed keyword queries to TF-only BM25", "reason": "Domain terms ('retrieval', 'ctx') appear in many docs → IDF ≈ 0. " "TF-only variant brought keyword R@3 to 0.724.", }, { "date": "2026-04-03", "decision": "External codebase R@5 gap (0.152) identified as next priority", "reason": "Heuristic over-fitting to internal 29-doc corpus. AST parser upgrade queued.", }, ], "with": ( "We switched from TF-IDF to BM25 on 2026-03-27 because keyword R@3 improved from " "0.379 → 0.655 (+27.3pp). The root cause: TF-IDF assigned near-zero IDF to domain " "terms like 'retrieval' and 'ctx' that appear across many documents in the small " "29-doc corpus, effectively ignoring the most relevant terms.\n\n" "We later discovered BM25Okapi (IDF-weighted) had the same problem, so keyword queries " "were re-routed to a TF-only BM25 variant — bringing keyword R@3 to 0.724." ), }, "What were the 3 telemetry schema bugs?": { "without": ( "I'm not aware of any telemetry schema fixes in my current context. " "Could you describe the issue?" ), "injected": [ { "date": "2026-04-29", "decision": "Fixed G1 block key mismatch: 'g1' → 'g1_decisions' in bm25-memory.py", "reason": "by_block key 'g1' never matched meta dict key 'g1_decisions' " "→ all G1 records logged UNKNOWN for query_type, retrieval_method, top_score.", }, { "date": "2026-04-29", "decision": "Fixed UNKNOWN string truthy bug in utility-rate.py", "reason": "'UNKNOWN' is truthy → or-chain skipped fallback classifier. " "Replaced with explicit != 'UNKNOWN' guard.", }, { "date": "2026-04-29", "decision": "G2-DOCS candidates_returned was hardcoded None", "reason": "Now calls build_docs_bm25() to get actual corpus size — " "consistent with G1 which uses len(corpus).", }, ], "with": ( "Three telemetry schema bugs were fixed on 2026-04-29:\n\n" "1. **Block key mismatch** — bm25-memory.py keyed the G1 injection block as 'g1' " "but utility-rate.py looked for 'g1_decisions'. Every G1 record logged UNKNOWN " "for all metadata fields.\n\n" "2. **UNKNOWN truthy bug** — the string 'UNKNOWN' is truthy in Python, so " "`block_meta.get('query_type') or _classify_query()` never called the fallback " "classifier. Fixed with an explicit `!= 'UNKNOWN'` guard.\n\n" "3. **candidates_returned hardcoded None** — G2-DOCS was always logging None " "instead of calling build_docs_bm25() for the actual corpus size." ), }, "Why does chat-memory need vec-daemon?": { "without": ( "I don't have details about chat-memory's architecture in my current context." ), "injected": [ { "date": "2026-04-17", "decision": "chat-memory.py uses hybrid BM25 + vec0 (multilingual-e5-small, 384-dim)", "reason": "Semantic rescue: dense recovers paraphrase queries BM25 misses. " "G1 hybrid Recall@7 = 0.983 vs BM25 0.967.", }, { "date": "2026-04-17", "decision": "vec-daemon communicates via Unix socket at ~/.local/share/claude-vault/", "reason": "< 1ms IPC round-trip. Windows-native has no Unix socket — P0 distribution blocker.", }, { "date": "2026-04-17", "decision": "bm25-memory.py has zero vec-daemon dependency by design", "reason": "BM25 path must work standalone. chat-memory falls back to BM25-only " "with ⚠ warning when daemon is down.", }, ], "with": ( "chat-memory.py uses vec-daemon for hybrid BM25 + semantic search. vec-daemon runs " "multilingual-e5-small (384-dim) and communicates via a Unix domain socket at " "~/.local/share/claude-vault/, giving <1ms round-trip latency.\n\n" "The semantic layer rescues paraphrase queries that BM25 misses — G1 hybrid " "Recall@7 is 0.983 vs 0.967 BM25-only.\n\n" "Unix sockets don't exist on Windows-native, making this a P0 distribution " "blocker for v1.0. bm25-memory.py was deliberately designed with zero vec-daemon " "dependency so the BM25-only path always works as a fallback." ), }, } # ── G2 scenarios: simulated file retrieval ──────────────────────────────────── G2_SCENARIOS = { "Update the BM25 scoring weights in the retrieval scorer": { "files": [ { "path": "src/retrieval/adaptive_trigger.py", "line": 214, "score": 0.91, "reason": "BM25 weight parameters — _bm25_retrieve(), _concept_retrieve() scoring", }, { "path": "benchmarks/eval/doc_retrieval_eval_v2.py", "line": 89, "score": 0.74, "reason": "BM25-augmented scoring in rank_ctx_doc() — stage 2 weight constants", }, { "path": "src/retrieval/full_context.py", "line": 12, "score": 0.61, "reason": "RetrievalResult dataclass — score field definitions", }, ], "tool_calls_without": 7, "latency_ms": 0.8, }, "Fix the telemetry block key mismatch": { "files": [ { "path": "~/.claude/hooks/bm25-memory.py", "line": 312, "score": 0.94, "reason": "G1 injection block keyed 'g1' — must match meta dict key 'g1_decisions'", }, { "path": "~/.claude/hooks/utility-rate.py", "line": 47, "score": 0.88, "reason": "block_meta lookup + UNKNOWN fallback guard — cm_block_meta merge logic", }, ], "tool_calls_without": 5, "latency_ms": 0.6, }, "Add Korean query expansion to G2-DOCS search": { "files": [ { "path": "~/.claude/hooks/bm25-memory.py", "line": 178, "score": 0.96, "reason": "_KO_EN_DOCS dict + _expand_ko_en_docs() — Korean→English expansion for docs BM25", }, { "path": "docs/research/20260426-g2-docs-korean-crosslingual-fix.md", "line": None, "score": 0.82, "reason": "Korean crosslingual fix design — H@5 0.400→1.000 after expansion", }, { "path": "benchmarks/eval/g2_docs_eval.py", "line": 203, "score": 0.67, "reason": "Korean query goldset + eval harness", }, ], "tool_calls_without": 9, "latency_ms": 0.7, }, } # ── Benchmark data ──────────────────────────────────────────────────────────── BENCHMARKS = [ ("G1 Decision Recall", "1.000", "0.219", "Recall@7", "MiniMax M2.5 downstream eval"), ("G2 Docs Retrieval", "1.000", "0.800", "H@5 (Hybrid)", "20-query goldset, 87 docs"), ("G2 Code Retrieval", "0.958", "0.946", "R@5", "vs Nemotron-Cascade-2"), ("Hallucination Rate", "0.000", "0.170", "rate ↓", "0% with CTX vs 17% without"), ("Hook Latency", "< 1 ms", "—", "per prompt", "Pure BM25 — no LLM, no embedding"), ("Utility Rate", "50%", "—", "tool-use turns", "Context actually cited by Claude"), ] # ── Custom CSS ──────────────────────────────────────────────────────────────── CSS = """ .ctx-header { text-align: center; padding: 1.5rem 0 0.5rem; } .ctx-header h1 { font-size: 2rem; font-weight: 700; margin-bottom: 0.25rem; } .ctx-header p { color: #6b7280; font-size: 1rem; margin: 0; } .panel-without { background: #fff8f8; border: 1px solid #fca5a5; border-radius: 8px; padding: 1rem; } .panel-inject { background: #f0fdf4; border: 1px solid #86efac; border-radius: 8px; padding: 1rem; } .panel-with { background: #eff6ff; border: 1px solid #93c5fd; border-radius: 8px; padding: 1rem; } .inject-item { background: #f9fafb; border-left: 3px solid #10b981; border-radius: 4px; padding: 0.6rem 0.8rem; margin-bottom: 0.5rem; font-size: 0.875rem; } .inject-date { font-weight: 600; color: #059669; } .inject-decision { font-weight: 500; margin: 0.2rem 0; } .inject-reason { color: #6b7280; font-size: 0.8rem; } .file-item { background: #f9fafb; border-left: 3px solid #6366f1; border-radius: 4px; padding: 0.6rem 0.8rem; margin-bottom: 0.5rem; font-size: 0.875rem; } .file-path { font-family: monospace; font-weight: 600; color: #4f46e5; } .file-score { float: right; background: #e0e7ff; color: #4338ca; padding: 0.1rem 0.4rem; border-radius: 4px; font-size: 0.75rem; font-weight: 700; } .file-reason { color: #6b7280; font-size: 0.8rem; margin-top: 0.25rem; } .bench-table { width: 100%; border-collapse: collapse; font-size: 0.9rem; } .bench-table th { background: #1e293b; color: white; padding: 0.6rem 0.8rem; text-align: left; } .bench-table td { padding: 0.55rem 0.8rem; border-bottom: 1px solid #e2e8f0; } .bench-table tr:hover td { background: #f8fafc; } .good { color: #059669; font-weight: 700; } .note-cell { color: #6b7280; font-size: 0.8rem; } .install-box { background: #0f172a; color: #e2e8f0; border-radius: 8px; padding: 1rem 1.25rem; font-family: monospace; font-size: 0.95rem; } .install-comment { color: #64748b; } .install-cmd { color: #34d399; } .stat-box { text-align: center; background: #f8fafc; border: 1px solid #e2e8f0; border-radius: 8px; padding: 1rem; } .stat-num { font-size: 2rem; font-weight: 700; color: #4f46e5; } .stat-label { font-size: 0.8rem; color: #6b7280; margin-top: 0.25rem; } """ # ── Helper: render G1 injection block as HTML ───────────────────────────────── def _render_injection(items: list) -> str: html = "
" html += f"📥 CTX G1 — {len(items)} decisions injected
" for d in items: html += ( f"
" f"{d['date']}
" f"
→ {d['decision']}
" f"
Reason: {d['reason']}
" f"
" ) return html def _render_files(files: list, latency_ms: float) -> str: html = "
" html += f"📂 CTX G2 — {len(files)} files injected in {latency_ms}ms
" for f in files: line_str = f":L{f['line']}" if f["line"] else "" html += ( f"
" f"score {f['score']:.2f}" f"{f['path']}{line_str}
" f"
→ {f['reason']}
" f"
" ) return html def _render_bench_table() -> str: rows = "" for name, with_ctx, without_ctx, unit, note in BENCHMARKS: rows += ( f"" f"{name}" f"{with_ctx}" f"{without_ctx}" f"{unit}" f"{note}" f"" ) return ( "" "" "" "" f"{rows}" "
MetricWith CTXWithout CTXUnitNotes
" ) # ── Event handlers ──────────────────────────────────────────────────────────── def run_g1(scenario_key: str): s = G1_SCENARIOS[scenario_key] without_html = ( "
" "❌ Without CTX

" f"{s['without']}" "
" ) inject_html = ( "
" + _render_injection(s["injected"]) + "
" ) with_html = ( "
" "✅ With CTX

" + s["with"].replace("\n\n", "

").replace("\n", "
") + "
" ) return without_html, inject_html, with_html def run_g2(task_key: str): s = G2_SCENARIOS[task_key] without_html = ( "
" "❌ Without CTX

" f"Claude runs {s['tool_calls_without']} Grep/Glob tool calls " "scanning the codebase before locating the relevant file." "
" ) files_html = ( "
" + _render_files(s["files"], s["latency_ms"]) + "
" ) with_html = ( "
" "✅ With CTX

" f"Claude opens the correct file on the first tool call. " f"Context injected in {s['latency_ms']}ms — " "no directory scan, no grep loop." "
" ) return without_html, files_html, with_html # ── Build Gradio app ────────────────────────────────────────────────────────── with gr.Blocks(css=CSS, title="CTX — Cross-Session Memory for Claude Code") as demo: gr.HTML( "
" "

🧠 CTX

" "

Cross-session memory for Claude Code — BM25 + hybrid semantic retrieval, " "<1ms latency, zero LLM calls.

" "
" ) with gr.Tabs(): # ── Tab 1: G1 Demo ───────────────────────────────────────────────────── with gr.TabItem("G1 — Decision Memory"): gr.Markdown( "**G1** recalls past engineering decisions from your git history and previous sessions. " "Pick a question, click Run — see what Claude says with and without CTX." ) g1_scenario = gr.Dropdown( choices=list(G1_SCENARIOS.keys()), value=list(G1_SCENARIOS.keys())[0], label="Scenario", interactive=True, ) g1_btn = gr.Button("▶ Run comparison", variant="primary") with gr.Row(): g1_without = gr.HTML(label="Without CTX") with gr.Row(): g1_inject = gr.HTML(label="CTX injection") with gr.Row(): g1_with = gr.HTML(label="With CTX") g1_btn.click( fn=run_g1, inputs=g1_scenario, outputs=[g1_without, g1_inject, g1_with], ) # ── Tab 2: G2 Demo ───────────────────────────────────────────────────── with gr.TabItem("G2 — File Retrieval"): gr.Markdown( "**G2** finds the exact file and line number before Claude's first tool call. " "Searches docs, codebase, and hooks simultaneously — in under 1ms." ) g2_task = gr.Dropdown( choices=list(G2_SCENARIOS.keys()), value=list(G2_SCENARIOS.keys())[0], label="Task", interactive=True, ) g2_btn = gr.Button("▶ Run comparison", variant="primary") with gr.Row(): g2_without = gr.HTML(label="Without CTX") with gr.Row(): g2_files = gr.HTML(label="CTX injection") with gr.Row(): g2_with = gr.HTML(label="With CTX") g2_btn.click( fn=run_g2, inputs=g2_task, outputs=[g2_without, g2_files, g2_with], ) # ── Tab 3: Benchmarks ────────────────────────────────────────────────── with gr.TabItem("Benchmarks"): gr.Markdown("Results from empirical evaluation across G1 (decision recall) and G2 (retrieval) surfaces.") with gr.Row(): gr.HTML("
1.000
G1 Recall@7
with CTX
") gr.HTML("
0%
Hallucination rate
with CTX
") gr.HTML("
<1ms
Hook latency
per prompt
") gr.HTML("
50%
Utility rate
(cited turns)
") gr.HTML(_render_bench_table()) gr.Markdown( "_G1 downstream eval: MiniMax M2.5, synthetic 32-query benchmark. " "G2 docs: 20-query goldset over 87 docs. " "G2 code: COIR RepoBench-style held-out eval._" ) # ── Tab 4: Install ───────────────────────────────────────────────────── with gr.TabItem("Install"): gr.Markdown("## Linux + WSL2 — one command") gr.HTML( "
" "# install + wire hooks into ~/.claude/settings.json
" "pip install ctx-retriever && ctx-install

" "# verify
" "ctx-install status" "
" ) gr.Markdown( "**What `ctx-install` does**\n" "1. Copies 4 hook files to `~/.claude/hooks/`\n" "2. Takes a timestamped backup of `~/.claude/settings.json`\n" "3. Merges CTX hook registrations — never overwrites your existing hooks\n" "4. Atomically writes the new settings.json\n" "5. Smoke-tests by firing `bm25-memory.py` once and confirming output\n\n" "Restart Claude Code after install. Hooks fire on every prompt automatically.\n\n" "> **Platform note**: v1.0 supports **Linux native and WSL2** only. " "Windows-native (Git Bash/MSYS2) support is in progress — " "blocked on upstream Claude Code issue [#34457](https://github.com/anthropics/claude-code/issues/34457).\n\n" "**Links**\n" "- [GitHub](https://github.com/jaytoone/CTX)\n" "- [CONTRIBUTING.md](https://github.com/jaytoone/CTX/blob/master/CONTRIBUTING.md)\n" "- [MIT License](https://github.com/jaytoone/CTX/blob/master/LICENSE)" ) gr.HTML( "
" "CTX v1.0 · MIT License · " "github.com/jaytoone/CTX" "
" ) if __name__ == "__main__": demo.launch()