# doc_analysis / app.py
# Author: Emmanuel Acheampong
# Fix startup: move theme/css to Blocks, call launch() unconditionally
# Commit: 9a22e75
"""
Crusoe Foundry β€” Infinite Context Demo
HuggingFace Space showcasing MemoryAlloyβ„’ & KV Cache sharing
"""
import os
import time
import tiktoken
import gradio as gr
from openai import OpenAI
# ── Crusoe Foundry client ─────────────────────────────────────────────────────
# Credentials and endpoint come from the environment. The placeholder key lets
# the UI render without credentials (API calls will simply fail with an error).
CRUSOE_API_KEY = os.environ.get("CRUSOE_API_KEY", "YOUR_API_KEY_HERE")
CRUSOE_BASE_URL = os.environ.get("CRUSOE_BASE_URL", "https://managed-inference-api-proxy.crusoecloud.com/v1/")
# Models exposed in the UI dropdown, served through Crusoe Foundry's
# OpenAI-compatible API.
AVAILABLE_MODELS = [
    "Qwen/Qwen3-235B-A22B-Instruct-2507",
    "deepseek-ai/DeepSeek-R1-0528",
    "moonshotai/Kimi-K2-Thinking",
    "deepseek-ai/DeepSeek-V3-0324",
    "meta-llama/Llama-3.3-70B-Instruct",
    "openai/gpt-oss-120b",
    "google/gemma-3-12b-it",
]
# Default model: overridable via CRUSOE_MODEL, otherwise the first entry above.
MODEL = os.environ.get("CRUSOE_MODEL", AVAILABLE_MODELS[0])
# OpenAI SDK client pointed at the Crusoe Foundry endpoint.
client = OpenAI(api_key=CRUSOE_API_KEY, base_url=CRUSOE_BASE_URL)
# ── Token counting ────────────────────────────────────────────────────────────
# Use the gpt-4 tokenizer for counting; fall back to cl100k_base if the model
# lookup fails. Broad Exception is deliberate best-effort here — counts are
# only displayed as badges, so an approximate tokenizer is acceptable even
# though the served models are not OpenAI models.
try:
    enc = tiktoken.encoding_for_model("gpt-4")
except Exception:
    enc = tiktoken.get_encoding("cl100k_base")
def count_tokens(text: str) -> int:
    """Return the number of tokens *text* occupies under the module tokenizer."""
    encoded = enc.encode(text)
    return len(encoded)
def format_tokens(n: int) -> str:
    """Render a token count compactly: 1_234 -> '1.2K', 2_500_000 -> '2.50M'."""
    scales = (
        (1_000_000, "{:.2f}M"),
        (1_000, "{:.1f}K"),
    )
    for divisor, pattern in scales:
        if n >= divisor:
            return pattern.format(n / divisor)
    return str(n)
# ── Document ingestion helpers ────────────────────────────────────────────────
def read_uploaded_file(file_path: str) -> str:
    """Read text from an uploaded file (txt, md, py, or pdf via pdfminer).

    Returns "" when *file_path* is None. PDF extraction errors (pdfminer
    missing, malformed file) are returned as an inline marker string so the
    UI degrades gracefully instead of raising.
    """
    if file_path is None:
        return ""
    ext = os.path.splitext(file_path)[1].lower()
    if ext == ".pdf":
        try:
            from pdfminer.high_level import extract_text
            return extract_text(file_path)
        except Exception as e:
            return f"[PDF extraction error: {e}]"
    # Explicit UTF-8: the previous locale-default encoding could silently
    # mis-decode uploads on non-UTF-8 platforms (e.g. Windows cp1252).
    # Undecodable bytes are replaced rather than raising.
    with open(file_path, "r", encoding="utf-8", errors="replace") as f:
        return f.read()
# ── KV-cache simulation state ─────────────────────────────────────────────────
_cache_store: dict[str, dict] = {}

def get_cache_key(context: str) -> str:
    """Deterministic key for *context*: MD5 hex digest (not security-sensitive)."""
    import hashlib

    digest = hashlib.md5(context.encode())
    return digest.hexdigest()
# ── Shared chat logic ─────────────────────────────────────────────────────────
def stream_response(system_prompt: str, history: list, user_msg: str, model: str = None):
    """Stream a chat completion from Crusoe Foundry.

    Yields (updated_history, token_info_str, latency_str, error_str) tuples
    as chunks arrive. *history* is a list of {"role": "user"|"assistant",
    "content": str} dicts (Gradio 6.x "messages" format). On API failure a
    single tuple is yielded with the error both appended to the chat and
    returned in the fourth slot.
    """
    model = model or MODEL
    # Full request: system prompt + prior turns + the new user turn.
    messages = [{"role": "system", "content": system_prompt}]
    for msg in history:
        messages.append({"role": msg["role"], "content": msg["content"]})
    messages.append({"role": "user", "content": user_msg})
    total_ctx_tokens = sum(count_tokens(m["content"]) for m in messages)
    new_history = history + [{"role": "user", "content": user_msg}]
    t0 = time.perf_counter()
    reply = ""
    try:
        stream = client.chat.completions.create(
            model=model,
            messages=messages,
            stream=True,
            max_tokens=2048,
        )
        for chunk in stream:
            # Some providers emit keep-alive/usage chunks with an empty
            # `choices` list; indexing them would raise IndexError mid-stream.
            if not chunk.choices:
                continue
            delta = chunk.choices[0].delta.content or ""
            reply += delta
            yield (
                new_history + [{"role": "assistant", "content": reply}],
                f"📄 **{format_tokens(total_ctx_tokens)} tokens** in context",
                f"⏱ {time.perf_counter() - t0:.2f}s",
                "",
            )
        if not reply:
            # Stream finished without any content chunk — still yield once so
            # the UI shows a final state instead of never updating.
            yield (
                new_history + [{"role": "assistant", "content": reply}],
                f"📄 **{format_tokens(total_ctx_tokens)} tokens** in context",
                f"⏱ {time.perf_counter() - t0:.2f}s",
                "",
            )
    except Exception as e:
        reply = f"❌ API error: {e}"
        yield (
            new_history + [{"role": "assistant", "content": reply}],
            f"📄 {format_tokens(total_ctx_tokens)} tokens in context",
            "—",
            str(e),
        )
# ─────────────────────────────────────────────────────────────────────────────
# TAB 1 β€” LEGAL (document Q&A)
# ─────────────────────────────────────────────────────────────────────────────
# Module-level store for the document tab (shared across requests).
legal_doc_store = {"text": "", "tokens": 0}

def legal_ingest(files):
    """Concatenate the uploads into the doc store; return (status, badge, preview)."""
    if not files:
        return "No files uploaded.", "0 tokens", gr.update()
    sections = []
    for path in files:
        sections.append(f"\n\n--- {os.path.basename(path)} ---\n\n")
        sections.append(read_uploaded_file(path))
    combined = "".join(sections)
    legal_doc_store["text"] = combined
    legal_doc_store["tokens"] = count_tokens(combined)
    badge = format_tokens(legal_doc_store["tokens"])
    ellipsis = "…" if len(combined) > 800 else ""
    preview = combined[:800] + ellipsis
    return (
        f"✅ Loaded {len(files)} document(s) — **{badge} tokens** ingested into context.",
        f"📄 {badge} tokens",
        gr.update(value=preview),
    )
def legal_chat(user_msg, history, model):
    """Stream an answer grounded in the currently loaded documents."""
    if not user_msg.strip():
        # Blank input: leave the chat untouched and reset the badges.
        yield history, "—", "—", ""
        return
    doc_context = legal_doc_store["text"]
    if doc_context:
        system = (
            "You are an expert analyst with access to the full text of the uploaded documents. "
            "Answer questions precisely, citing relevant sections when possible. "
            "If a question cannot be answered from the document, say so clearly.\n\n"
            f"=== DOCUMENT CONTEXT ===\n{doc_context}\n=== END CONTEXT ==="
        )
    else:
        system = "You are a helpful document analyst. No documents have been loaded yet."
    yield from stream_response(system, history, user_msg, model)
# ─────────────────────────────────────────────────────────────────────────────
# TAB 2 β€” DEV (codebase Q&A)
# ─────────────────────────────────────────────────────────────────────────────
# Module-level store for the codebase tab (shared across requests).
dev_code_store = {"text": "", "tokens": 0}

def dev_ingest(files, raw_paste):
    """Merge pasted code and uploaded files into the code store."""
    pieces = [raw_paste or ""]
    for path in files or []:
        pieces.append(f"\n\n# === {os.path.basename(path)} ===\n\n")
        pieces.append(read_uploaded_file(path))
    combined = "".join(pieces)
    dev_code_store["text"] = combined
    dev_code_store["tokens"] = count_tokens(combined)
    badge = format_tokens(dev_code_store["tokens"])
    ellipsis = "…" if len(combined) > 800 else ""
    preview = combined[:800] + ellipsis
    return (
        f"✅ Codebase loaded — **{badge} tokens** in context.",
        f"📄 {badge} tokens",
        gr.update(value=preview),
    )
def dev_chat(user_msg, history, model):
    """Stream an answer grounded in the currently loaded codebase."""
    if not user_msg.strip():
        # Blank input: leave the chat untouched and reset the badges.
        yield history, "—", "—", ""
        return
    code_context = dev_code_store["text"]
    if code_context:
        system = (
            "You are a senior software engineer with full visibility into the provided codebase. "
            "Answer questions about architecture, bugs, refactoring, and code quality. "
            "Reference specific file names, function names, and line context when relevant.\n\n"
            f"=== CODEBASE ===\n{code_context}\n=== END CODEBASE ==="
        )
    else:
        system = "You are a helpful coding assistant. No code has been loaded yet."
    yield from stream_response(system, history, user_msg, model)
# ─────────────────────────────────────────────────────────────────────────────
# TAB 3 β€” MEMORY DEMO (KV-cache visibility)
# ─────────────────────────────────────────────────────────────────────────────
# Module-level state for the KV-cache demo tab (shared across requests).
memory_state = {
    "cached_context": "",
    "cached_tokens": 0,
    "query_count": 0,
    "total_saved_tokens": 0,
}

def memory_set_context(context_text):
    """Cache *context_text* and reset the per-session savings counters."""
    memory_state.update(
        cached_context=context_text,
        cached_tokens=count_tokens(context_text),
        query_count=0,
        total_saved_tokens=0,
    )
    badge = format_tokens(memory_state["cached_tokens"])
    status = (
        f"✅ Context set — **{badge} tokens** ready. "
        "Savings below are estimated based on context size."
    )
    return status, _render_cache_stats()
def _render_cache_stats():
    """Markdown summary of the current (estimated) cache statistics."""
    queries = memory_state["query_count"]
    saved = memory_state["total_saved_tokens"]
    cached_tok = memory_state["cached_tokens"]
    lines = [
        f"**Context tokens:** {format_tokens(cached_tok)}",
        f"**Queries run:** {queries}",
        f"**Estimated tokens saved\\*:** {format_tokens(saved)}",
        # $3 per 1M tokens is the illustrative rate used for the demo.
        f"**Estimated cost savings\\*:** ~${saved * 0.000003:.4f} @ $3/1M tokens",
        "_\\* Estimates assume full KV cache reuse per query. Actual savings depend on server-side cache availability._",
    ]
    return "\n\n".join(lines)
def memory_chat(user_msg, history, model):
    """Stream an answer against the cached context, updating cache stats."""
    if not user_msg.strip():
        # Blank input: leave the chat untouched, re-render current stats.
        yield history, "—", "—", _render_cache_stats(), ""
        return
    cached_ctx = memory_state["cached_context"]
    if cached_ctx:
        system = (
            "You are a helpful assistant with a pre-loaded context. "
            "The context below has been KV-cached — it does not need to be re-encoded for each query.\n\n"
            f"=== CACHED CONTEXT ===\n{cached_ctx}\n=== END CONTEXT ==="
        )
        cache_badge = "🟢 **Cache HIT (estimated)** — context eligible for KV cache reuse"
    else:
        system = "You are a helpful assistant. No context has been cached yet."
        cache_badge = "⚪ No cache"
    # Simulated cache hit: every query "saves" the cached context tokens.
    memory_state["query_count"] += 1
    memory_state["total_saved_tokens"] += memory_state["cached_tokens"]
    for history_out, tok_info, latency, _err in stream_response(system, history, user_msg, model):
        yield history_out, tok_info, latency, _render_cache_stats(), cache_badge
# ─────────────────────────────────────────────────────────────────────────────
# GRADIO UI
# ─────────────────────────────────────────────────────────────────────────────
# Brand colors. NOTE(review): these two constants are not referenced anywhere
# in this file — the hex values are hard-coded again in `css` and the header
# HTML below; consider using or removing them.
CRUSOE_BLUE = "#1B4FCC"
CRUSOE_DARK = "#0D1B2A"
# Custom CSS passed to gr.Blocks: header/badge styling plus hiding the
# default Gradio footer.
css = """
.crusoe-header { text-align: center; padding: 1.5rem 0 0.5rem; }
.token-badge { font-size: 1.1rem; font-weight: 600; color: #1B4FCC; }
.cache-stats { background: #f0f4ff; border-radius: 8px; padding: 1rem; }
.cache-hit { color: #16a34a; font-weight: 700; font-size: 1rem; }
.stat-row { display: flex; gap: 1.5rem; align-items: center; }
footer { display: none !important; }
"""
# Build the Gradio UI: a shared model dropdown plus three tabs (document Q&A,
# codebase Q&A, KV-cache demo). Theme and CSS are passed to Blocks directly.
with gr.Blocks(title="Crusoe Foundry — Infinite Context Demo", theme=gr.themes.Soft(primary_hue="blue"), css=css) as demo:
    # ── Header ────────────────────────────────────────────────────────────────
    gr.HTML("""
    <div class="crusoe-header">
      <h1 style="font-size:1.8rem;font-weight:700;color:#0D1B2A;margin:0">
        Infinite Context Demo
      </h1>
      <p style="color:#555;margin:0.3rem 0 0">
        Powered by <strong>Crusoe Foundry</strong> &nbsp;·&nbsp;
        MemoryAlloy™ &amp; KV Cache Sharing
      </p>
    </div>
    """)
    # Model picker shared by all three tabs (passed into every chat handler).
    with gr.Row():
        model_selector = gr.Dropdown(
            choices=AVAILABLE_MODELS,
            value=MODEL,
            label="Model",
            scale=2,
        )
    with gr.Tabs():
        # ── TAB 1: LEGAL ──────────────────────────────────────────────────────
        with gr.Tab("📄 Document Analysis"):
            gr.Markdown(
                "Upload any documents — ask questions "
                "across the **entire document** with no chunking or retrieval needed."
            )
            with gr.Row():
                with gr.Column(scale=1):
                    legal_files = gr.File(
                        label="Upload Documents (PDF, TXT, MD)",
                        file_count="multiple",
                        file_types=[".pdf", ".txt", ".md", ".docx"],
                    )
                    legal_ingest_btn = gr.Button("📥 Load into Context", variant="primary")
                    legal_status = gr.Markdown("No documents loaded.")
                    legal_token_badge = gr.Markdown("", elem_classes=["token-badge"])
                    legal_preview = gr.Textbox(
                        label="Document Preview",
                        lines=6,
                        interactive=False,
                        placeholder="Document text will appear here after loading…",
                    )
                with gr.Column(scale=2):
                    legal_chatbot = gr.Chatbot(label="Document Q&A", height=420)
                    with gr.Row():
                        legal_input = gr.Textbox(
                            placeholder="e.g. Summarize the key points of this document.",
                            label="Ask a question",
                            scale=4,
                        )
                        legal_send = gr.Button("Send", variant="primary", scale=1)
                    with gr.Row():
                        legal_tok_info = gr.Markdown("", elem_classes=["token-badge"])
                        legal_latency = gr.Markdown("")
                        # NOTE(review): created invisible and never toggled, so
                        # errors written here are not shown (unlike dev_err).
                        legal_err = gr.Markdown("", visible=False)
                    gr.Examples(
                        examples=[
                            ["Summarize the key points of this document."],
                            ["What are the main topics covered?"],
                            ["List every date or deadline mentioned."],
                            ["What conclusions or recommendations are made?"],
                            ["Extract all named entities (people, organizations, places)."],
                        ],
                        inputs=legal_input,
                    )
            legal_ingest_btn.click(
                legal_ingest,
                inputs=[legal_files],
                outputs=[legal_status, legal_token_badge, legal_preview],
            )
            # Thin wrapper so both the Send button and textbox Enter share one
            # streaming generator handler.
            def legal_submit(msg, history, model):
                yield from legal_chat(msg, history, model)
            # `.then(lambda: "", ...)` clears the input box after each send.
            legal_send.click(
                legal_submit,
                inputs=[legal_input, legal_chatbot, model_selector],
                outputs=[legal_chatbot, legal_tok_info, legal_latency, legal_err],
            ).then(lambda: "", outputs=legal_input)
            legal_input.submit(
                legal_submit,
                inputs=[legal_input, legal_chatbot, model_selector],
                outputs=[legal_chatbot, legal_tok_info, legal_latency, legal_err],
            ).then(lambda: "", outputs=legal_input)
        # ── TAB 2: DEV ────────────────────────────────────────────────────────
        with gr.Tab("💻 Codebase Intelligence"):
            gr.Markdown(
                "Upload source files or paste code — reason across your **entire codebase** "
                "simultaneously. No embeddings, no retrieval, no chunking."
            )
            with gr.Row():
                with gr.Column(scale=1):
                    dev_files = gr.File(
                        label="Upload Source Files",
                        file_count="multiple",
                        file_types=[".py", ".js", ".ts", ".go", ".rs", ".java", ".txt", ".md"],
                    )
                    dev_paste = gr.Textbox(
                        label="Or paste code directly",
                        lines=8,
                        placeholder="Paste your code here…",
                    )
                    dev_ingest_btn = gr.Button("📥 Load Codebase", variant="primary")
                    dev_status = gr.Markdown("No code loaded.")
                    dev_token_badge = gr.Markdown("", elem_classes=["token-badge"])
                    dev_preview = gr.Textbox(
                        label="Codebase Preview",
                        lines=5,
                        interactive=False,
                        placeholder="Loaded code will appear here…",
                    )
                with gr.Column(scale=2):
                    dev_chatbot = gr.Chatbot(label="Codebase Q&A", height=420)
                    with gr.Row():
                        dev_input = gr.Textbox(
                            placeholder="e.g. Where is the authentication logic and how does it work?",
                            label="Ask about your codebase",
                            scale=4,
                        )
                        dev_send = gr.Button("Send", variant="primary", scale=1)
                    with gr.Row():
                        dev_tok_info = gr.Markdown("", elem_classes=["token-badge"])
                        dev_latency = gr.Markdown("")
                        dev_err = gr.Markdown("")
                    gr.Examples(
                        examples=[
                            ["Explain the overall architecture of this codebase."],
                            ["Where are potential race conditions or concurrency issues?"],
                            ["List all API endpoints and their HTTP methods."],
                            ["Which functions have no error handling?"],
                            ["How would I add rate limiting to this service?"],
                        ],
                        inputs=dev_input,
                    )
            dev_ingest_btn.click(
                dev_ingest,
                inputs=[dev_files, dev_paste],
                outputs=[dev_status, dev_token_badge, dev_preview],
            )
            # Shared handler for button click and textbox Enter.
            def dev_submit(msg, history, model):
                yield from dev_chat(msg, history, model)
            dev_send.click(
                dev_submit,
                inputs=[dev_input, dev_chatbot, model_selector],
                outputs=[dev_chatbot, dev_tok_info, dev_latency, dev_err],
            ).then(lambda: "", outputs=dev_input)
            dev_input.submit(
                dev_submit,
                inputs=[dev_input, dev_chatbot, model_selector],
                outputs=[dev_chatbot, dev_tok_info, dev_latency, dev_err],
            ).then(lambda: "", outputs=dev_input)
        # ── TAB 3: MEMORY DEMO ────────────────────────────────────────────────
        with gr.Tab("🧠 MemoryAlloy™ Demo"):
            gr.Markdown(
                "See KV cache sharing in action. Set a large context once — every subsequent "
                "query reuses the **cached key-value representations**, slashing compute and cost.\n\n"
                "> **Note:** Token savings shown below are *estimated* based on context size. "
                "Actual cache reuse depends on server-side KV cache availability on Crusoe Foundry."
            )
            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown("### 1. Set Shared Context")
                    memory_context_input = gr.Textbox(
                        label="Context to cache (paste any large text)",
                        lines=12,
                        placeholder="Paste a large document, knowledge base, or system context here. "
                        "This will be cached and reused across all queries.",
                    )
                    memory_cache_btn = gr.Button("🔒 Lock into KV Cache", variant="primary")
                    memory_cache_status = gr.Markdown("No context cached.")
                    gr.Markdown("### 2. Cache Stats")
                    memory_stats = gr.Markdown("", elem_classes=["cache-stats"])
                with gr.Column(scale=2):
                    gr.Markdown("### 3. Query Against Cached Context")
                    memory_chatbot = gr.Chatbot(
                        label="Memory-Augmented Chat",
                        height=380,
                    )
                    with gr.Row():
                        memory_input = gr.Textbox(
                            placeholder="Ask anything — the context is already cached…",
                            label="Your question",
                            scale=4,
                        )
                        memory_send = gr.Button("Send", variant="primary", scale=1)
                    with gr.Row():
                        memory_tok_info = gr.Markdown("", elem_classes=["token-badge"])
                        memory_latency = gr.Markdown("")
                        memory_cache_hit = gr.Markdown("", elem_classes=["cache-hit"])
                        memory_err = gr.Markdown("")
                    gr.Examples(
                        examples=[
                            ["Summarize the key points in 3 sentences."],
                            ["What topics are covered in this context?"],
                            ["Extract all named entities mentioned."],
                            ["What are the most important dates or numbers?"],
                        ],
                        inputs=memory_input,
                    )
            memory_cache_btn.click(
                memory_set_context,
                inputs=[memory_context_input],
                outputs=[memory_cache_status, memory_stats],
            )
            # Shared handler for button click and textbox Enter; also updates
            # the cache-stats panel and the hit/miss badge on every chunk.
            def memory_submit(msg, history, model):
                yield from memory_chat(msg, history, model)
            memory_send.click(
                memory_submit,
                inputs=[memory_input, memory_chatbot, model_selector],
                outputs=[memory_chatbot, memory_tok_info, memory_latency, memory_stats, memory_cache_hit],
            ).then(lambda: "", outputs=memory_input)
            memory_input.submit(
                memory_submit,
                inputs=[memory_input, memory_chatbot, model_selector],
                outputs=[memory_chatbot, memory_tok_info, memory_latency, memory_stats, memory_cache_hit],
            ).then(lambda: "", outputs=memory_input)
    # ── Footer ────────────────────────────────────────────────────────────────
    gr.HTML("""
    <div style="text-align:center;color:#888;padding:1.5rem 0 0.5rem;font-size:0.85rem">
      Built on <strong>Crusoe Foundry</strong> &nbsp;·&nbsp;
      Sustainable AI compute &nbsp;·&nbsp;
      <a href="https://crusoe.ai" target="_blank">crusoe.ai</a>
    </div>
    """)
# Launched unconditionally (no __main__ guard) so the app starts when the
# Space runs this file directly.
demo.launch()