"""
Crusoe Foundry — Infinite Context Demo
HuggingFace Space showcasing MemoryAlloy™ & KV Cache sharing
"""

import os
import time
import tiktoken
import gradio as gr
from openai import OpenAI

# ── Crusoe Foundry client ─────────────────────────────────────────────────────
# Credentials, endpoint and default model can all be overridden through
# environment variables; the literals below are only fallbacks.
CRUSOE_API_KEY = os.environ.get("CRUSOE_API_KEY", "YOUR_API_KEY_HERE")
CRUSOE_BASE_URL = os.environ.get("CRUSOE_BASE_URL", "https://managed-inference-api-proxy.crusoecloud.com/v1/")

AVAILABLE_MODELS = [
    "Qwen/Qwen3-235B-A22B-Instruct-2507",
    "deepseek-ai/DeepSeek-R1-0528",
    "moonshotai/Kimi-K2-Thinking",
    "deepseek-ai/DeepSeek-V3-0324",
    "meta-llama/Llama-3.3-70B-Instruct",
    "openai/gpt-oss-120b",
    "google/gemma-3-12b-it",
]

MODEL = os.environ.get("CRUSOE_MODEL", AVAILABLE_MODELS[0])

client = OpenAI(api_key=CRUSOE_API_KEY, base_url=CRUSOE_BASE_URL)

# ── Token counting ────────────────────────────────────────────────────────────
# A single GPT-4 tokenizer is used for every model in AVAILABLE_MODELS, so the
# counts shown in the UI are an approximation for the non-OpenAI models.
try:
    enc = tiktoken.encoding_for_model("gpt-4")
except Exception:
    enc = tiktoken.get_encoding("cl100k_base")


def count_tokens(text: str) -> int:
    """Return the number of tokens `text` encodes to with the module tokenizer.

    Args:
        text: Arbitrary text (uploaded documents, pasted code, chat messages).
            Empty or None input counts as 0 tokens.

    Returns:
        The token count as an int.
    """
    if not text:
        return 0
    # User-supplied text may literally contain special-token strings such as
    # "<|endoftext|>". With tiktoken's default (disallowed_special="all") that
    # raises ValueError; an empty tuple makes them encode as ordinary text.
    return len(enc.encode(text, disallowed_special=()))


def format_tokens(n: int) -> str:
    """Format a token count compactly: "999", "1.5K", "2.00M".

    Args:
        n: Token count.

    Returns:
        The plain number below 1,000, one decimal with a "K" suffix below
        1,000,000, otherwise two decimals with an "M" suffix.
    """
    if n < 1_000:
        return str(n)
    if n < 1_000_000:
        kilo = f"{n / 1_000:.1f}"
        # Values just under one million (e.g. 999_950) round up to "1000.0";
        # let those fall through to the "M" form instead of showing "1000.0K".
        if kilo != "1000.0":
            return f"{kilo}K"
    return f"{n / 1_000_000:.2f}M"


# ── Document ingestion helpers ────────────────────────────────────────────────
def read_uploaded_file(file_path: str) -> str:
    """Read text from uploaded file (txt, md, py, or pdf via pdfminer).

    Args:
        file_path: Path of the uploaded file, or None when nothing was uploaded.

    Returns:
        The file's text. None input gives "". A PDF that cannot be extracted
        (including pdfminer not being importable) gives a bracketed error
        string rather than raising, so one bad file does not abort ingestion.
    """
    if file_path is None:
        return ""
    ext = os.path.splitext(file_path)[1].lower()
    if ext == ".pdf":
        try:
            # Imported lazily so the import only happens when a PDF is read.
            from pdfminer.high_level import extract_text
            return extract_text(file_path)
        except Exception as e:
            return f"[PDF extraction error: {e}]"
    # Decode explicitly as UTF-8 instead of the platform default encoding;
    # undecodable bytes are replaced rather than raising.
    with open(file_path, "r", encoding="utf-8", errors="replace") as f:
        return f.read()


# ── KV-cache simulation state ─────────────────────────────────────────────────
_cache_store: dict[str, dict] = {}


def get_cache_key(context: str) -> str:
    """Return the hex MD5 digest of `context`, used as a cache key (not for security)."""
    import hashlib
    return hashlib.md5(context.encode()).hexdigest()


# ── Shared chat logic
───────────────────────────────────────────────────────── def stream_response(system_prompt: str, history: list, user_msg: str, model: str = None): """ Streams a response from Crusoe Foundry. Returns (updated_history, token_info_str, latency_str, error_str) history is a list of {"role": "user"|"assistant", "content": str} dicts (Gradio 6.x format). """ model = model or MODEL messages = [{"role": "system", "content": system_prompt}] for msg in history: messages.append({"role": msg["role"], "content": msg["content"]}) messages.append({"role": "user", "content": user_msg}) total_ctx_tokens = sum(count_tokens(m["content"]) for m in messages) new_history = history + [{"role": "user", "content": user_msg}] t0 = time.perf_counter() reply = "" try: stream = client.chat.completions.create( model=model, messages=messages, stream=True, max_tokens=2048, ) for chunk in stream: delta = chunk.choices[0].delta.content or "" reply += delta yield ( new_history + [{"role": "assistant", "content": reply}], f"📄 **{format_tokens(total_ctx_tokens)} tokens** in context", f"⏱ {time.perf_counter() - t0:.2f}s", "", ) except Exception as e: reply = f"❌ API error: {e}" yield ( new_history + [{"role": "assistant", "content": reply}], f"📄 {format_tokens(total_ctx_tokens)} tokens in context", "—", str(e), ) # ───────────────────────────────────────────────────────────────────────────── # TAB 1 — LEGAL (document Q&A) # ───────────────────────────────────────────────────────────────────────────── legal_doc_store = {"text": "", "tokens": 0} def legal_ingest(files): if not files: return "No files uploaded.", "0 tokens", gr.update() combined = "" for f in files: combined += f"\n\n--- {os.path.basename(f)} ---\n\n" combined += read_uploaded_file(f) legal_doc_store["text"] = combined legal_doc_store["tokens"] = count_tokens(combined) tok_str = format_tokens(legal_doc_store["tokens"]) preview = combined[:800] + ("…" if len(combined) > 800 else "") return ( f"✅ Loaded {len(files)} document(s) — 
**{tok_str} tokens** ingested into context.", f"📄 {tok_str} tokens", gr.update(value=preview), ) def legal_chat(user_msg, history, model): if not user_msg.strip(): yield history, "—", "—", "" return doc_context = legal_doc_store["text"] system = ( "You are an expert analyst with access to the full text of the uploaded documents. " "Answer questions precisely, citing relevant sections when possible. " "If a question cannot be answered from the document, say so clearly.\n\n" f"=== DOCUMENT CONTEXT ===\n{doc_context}\n=== END CONTEXT ===" if doc_context else "You are a helpful document analyst. No documents have been loaded yet." ) yield from stream_response(system, history, user_msg, model) # ───────────────────────────────────────────────────────────────────────────── # TAB 2 — DEV (codebase Q&A) # ───────────────────────────────────────────────────────────────────────────── dev_code_store = {"text": "", "tokens": 0} def dev_ingest(files, raw_paste): combined = raw_paste or "" for f in (files or []): combined += f"\n\n# === {os.path.basename(f)} ===\n\n" combined += read_uploaded_file(f) dev_code_store["text"] = combined dev_code_store["tokens"] = count_tokens(combined) tok_str = format_tokens(dev_code_store["tokens"]) preview = combined[:800] + ("…" if len(combined) > 800 else "") return ( f"✅ Codebase loaded — **{tok_str} tokens** in context.", f"📄 {tok_str} tokens", gr.update(value=preview), ) def dev_chat(user_msg, history, model): if not user_msg.strip(): yield history, "—", "—", "" return code_context = dev_code_store["text"] system = ( "You are a senior software engineer with full visibility into the provided codebase. " "Answer questions about architecture, bugs, refactoring, and code quality. " "Reference specific file names, function names, and line context when relevant.\n\n" f"=== CODEBASE ===\n{code_context}\n=== END CODEBASE ===" if code_context else "You are a helpful coding assistant. No code has been loaded yet." 
) yield from stream_response(system, history, user_msg, model) # ───────────────────────────────────────────────────────────────────────────── # TAB 3 — MEMORY DEMO (KV-cache visibility) # ───────────────────────────────────────────────────────────────────────────── memory_state = { "cached_context": "", "cached_tokens": 0, "query_count": 0, "total_saved_tokens": 0, } def memory_set_context(context_text): memory_state["cached_context"] = context_text memory_state["cached_tokens"] = count_tokens(context_text) memory_state["query_count"] = 0 memory_state["total_saved_tokens"] = 0 tok_str = format_tokens(memory_state["cached_tokens"]) return ( f"✅ Context set — **{tok_str} tokens** ready. Savings below are estimated based on context size.", _render_cache_stats(), ) def _render_cache_stats(): q = memory_state["query_count"] saved = memory_state["total_saved_tokens"] cached_tok = memory_state["cached_tokens"] return ( f"**Context tokens:** {format_tokens(cached_tok)}\n\n" f"**Queries run:** {q}\n\n" f"**Estimated tokens saved\\*:** {format_tokens(saved)}\n\n" f"**Estimated cost savings\\*:** ~${saved * 0.000003:.4f} @ $3/1M tokens\n\n" f"_\\* Estimates assume full KV cache reuse per query. Actual savings depend on server-side cache availability._" ) def memory_chat(user_msg, history, model): if not user_msg.strip(): yield history, "—", "—", _render_cache_stats(), "" return cached_ctx = memory_state["cached_context"] system = ( "You are a helpful assistant with a pre-loaded context. " "The context below has been KV-cached — it does not need to be re-encoded for each query.\n\n" f"=== CACHED CONTEXT ===\n{cached_ctx}\n=== END CONTEXT ===" if cached_ctx else "You are a helpful assistant. No context has been cached yet." 
) # Simulate cache hit: saved tokens = cached context tokens (not re-encoded) memory_state["query_count"] += 1 memory_state["total_saved_tokens"] += memory_state["cached_tokens"] for history_out, tok_info, latency, err in stream_response(system, history, user_msg, model): # Annotate with cache hit badge cache_badge = "🟢 **Cache HIT (estimated)** — context eligible for KV cache reuse" if cached_ctx else "⚪ No cache" yield history_out, tok_info, latency, _render_cache_stats(), cache_badge # ───────────────────────────────────────────────────────────────────────────── # GRADIO UI # ───────────────────────────────────────────────────────────────────────────── CRUSOE_BLUE = "#1B4FCC" CRUSOE_DARK = "#0D1B2A" css = """ .crusoe-header { text-align: center; padding: 1.5rem 0 0.5rem; } .token-badge { font-size: 1.1rem; font-weight: 600; color: #1B4FCC; } .cache-stats { background: #f0f4ff; border-radius: 8px; padding: 1rem; } .cache-hit { color: #16a34a; font-weight: 700; font-size: 1rem; } .stat-row { display: flex; gap: 1.5rem; align-items: center; } footer { display: none !important; } """ with gr.Blocks(title="Crusoe Foundry — Infinite Context Demo", theme=gr.themes.Soft(primary_hue="blue"), css=css) as demo: # ── Header ──────────────────────────────────────────────────────────────── gr.HTML("""
Powered by Crusoe Foundry · MemoryAlloy™ & KV Cache Sharing