Spaces:

fuhaddesmond
/

kimi-k25-coding-platform

Paused

App Files Files Community

kimi-k25-coding-platform / app.py

fuhaddesmond

Upload app.py with huggingface_hub

98fe812 verified about 1 month ago

raw

history blame contribute delete

10.2 kB

	import gradio as gr
	from openai import OpenAI
	import os

	# ──────────────────────────────────────────────────────────────
	# Kimi K2.5 Production AI Coding Assistant
	# Architecture: OpenAI-compatible client → NVIDIA API → Kimi K2.5
	# ──────────────────────────────────────────────────────────────

	# Initialize the OpenAI-compatible client pointing at NVIDIA's inference servers.
	# This keeps your agent "backend-swappable" — change base_url to switch providers
	# (NVIDIA → SiliconFlow → Moonshot → local vLLM) without rewriting any logic.
	#
	# Why the `or` fallback: when api_key is None the OpenAI constructor first tries
	# the OPENAI_API_KEY environment variable and, if that is unset too, raises.
	# That would either crash the app at import time or forward an unrelated
	# secret to this base_url. A placeholder lets the UI start; the first request
	# is then rejected by the server and surfaced by the chat handler as an
	# authentication error instead.
	client = OpenAI(
	    base_url="https://integrate.api.nvidia.com/v1",
	    api_key=os.getenv("NVIDIA_API_KEY") or "NVIDIA_API_KEY-not-set",
	)

	# Model identifier passed as the `model` field of every chat-completion request.
	MODEL_ID = "moonshotai/kimi-k2.5"

	# System message placed first in the message list of every request; it fixes
	# the assistant's persona and its formatting rules for code and explanations.
	SYSTEM_PROMPT = """You are Kimi K2.5, an advanced AI coding assistant built by Moonshot AI.
	You are an expert in software engineering across all programming languages and frameworks.

	Your capabilities include:
	- Writing, reviewing, and debugging code in any language
	- Explaining complex programming concepts clearly
	- Suggesting best practices, design patterns, and architectural decisions
	- Analyzing code for performance, security, and maintainability
	- Generating complete applications, APIs, and systems
	- Helping with DevOps, databases, cloud infrastructure, and more

	When writing code:
	- Always use proper code blocks with language identifiers
	- Include comments for complex logic
	- Follow language-specific conventions and best practices
	- Provide complete, runnable code when possible
	- Suggest tests when appropriate

	When explaining:
	- Be thorough but concise
	- Use examples to illustrate concepts
	- Break complex topics into digestible parts
	- Reference relevant documentation or standards when helpful

	Be direct, accurate, and helpful. If you're unsure about something, say so rather than guessing."""


	# Markup wrapped around the reasoning text once the answer starts streaming.
	# Kept as constants so the renderer and the history cleaner cannot drift apart.
	_REASONING_OPEN = "<details><summary>🧠 Reasoning (click to expand)</summary>\n\n"
	_REASONING_CLOSE = "\n\n</details>\n\n"


	def _strip_reasoning(text):
	    """Return *text* without the collapsible reasoning block prepended for display."""
	    if isinstance(text, str) and text.startswith(_REASONING_OPEN):
	        # Split on the first closing marker: the answer (which may itself
	        # contain HTML) is everything after the reasoning wrapper.
	        _, marker, answer = text.partition(_REASONING_CLOSE)
	        if marker:
	            return answer
	    return text


	def _describe_error(exc):
	    """Translate an exception raised by the API call into the chat-visible message."""
	    text = str(exc)
	    status = getattr(exc, "status_code", None)
	    if status is None:
	        # No HTTP status on the exception (e.g. connection failures): fall back
	        # to inspecting the message text.
	        is_auth = "401" in text or "auth" in text.lower()
	        is_rate_limited = "429" in text
	        is_server_side = any(code in text for code in ("500", "502", "503"))
	    else:
	        # Prefer the real status code; substring sniffing misfires on messages
	        # that merely contain "auth" or a matching number.
	        is_auth = status in (401, 403)
	        is_rate_limited = status == 429
	        is_server_side = status >= 500

	    if is_auth:
	        return "❌ Authentication Error: NVIDIA_API_KEY is missing or invalid. Set it in Space Secrets."
	    if is_rate_limited:
	        return "❌ Rate Limit: Too many requests. Wait a moment and try again."
	    if is_server_side:
	        return "❌ Server Error: NVIDIA API temporarily unavailable. Try again shortly."
	    return f"❌ Error: {text}"


	def respond(message, chat_history, enable_thinking):
	    """
	    Stream a response from Kimi K2.5 via NVIDIA API.
	    Uses Gradio's standard streaming chatbot pattern.

	    Args:
	        message: Text typed by the user. Blank or missing input is a no-op.
	        chat_history: List of [user_msg, assistant_msg] pairs; the new turn is
	            appended to it and its assistant cell is updated in place.
	        enable_thinking: Whether to request the model's reasoning phase. The
	            flag is always sent explicitly so that unchecking the box does not
	            fall back to the backend's default.

	    Yields:
	        (textbox_value, chat_history) tuples: the textbox is cleared ("") on
	        every update once a request has started, and chat_history carries the
	        progressively longer assistant reply.
	    """
	    if chat_history is None:
	        chat_history = []
	    if not message or not message.strip():
	        # Nothing to send: hand the inputs back unchanged.
	        yield message or "", chat_history
	        return

	    # Build conversation history in OpenAI message format
	    messages = [{"role": "system", "content": SYSTEM_PROMPT}]

	    # chat_history is a list of [user_msg, assistant_msg] pairs
	    for pair in chat_history:
	        if len(pair) >= 2 and pair[0] and pair[1]:
	            messages.append({"role": "user", "content": pair[0]})
	            # Send back only the answer, not the reasoning HTML rendered for the UI.
	            messages.append({"role": "assistant", "content": _strip_reasoning(pair[1])})

	    messages.append({"role": "user", "content": message})

	    # Add user message to chat history immediately
	    chat_history.append([message, ""])

	    thinking_content = ""
	    response_content = ""

	    try:
	        # Call NVIDIA's OpenAI-compatible streaming endpoint
	        stream = client.chat.completions.create(
	            model=MODEL_ID,
	            messages=messages,
	            stream=True,
	            temperature=1.0,
	            top_p=1.0,
	            max_tokens=16384,
	            extra_body={"chat_template_kwargs": {"thinking": bool(enable_thinking)}},
	        )

	        for chunk in stream:
	            if not chunk.choices:
	                continue

	            delta = chunk.choices[0].delta

	            # Handle thinking/reasoning content (the attribute is absent on
	            # deltas that carry no reasoning text).
	            reasoning = getattr(delta, "reasoning_content", None)
	            if reasoning:
	                thinking_content += reasoning
	                chat_history[-1][1] = f"🧠 Thinking...\n\n{thinking_content}"
	                yield "", chat_history

	            # Handle regular content
	            if delta.content:
	                response_content += delta.content
	                if thinking_content:
	                    display = (
	                        f"{_REASONING_OPEN}{thinking_content}"
	                        f"{_REASONING_CLOSE}{response_content}"
	                    )
	                else:
	                    display = response_content
	                chat_history[-1][1] = display
	                yield "", chat_history

	        # Final output — decide on the answer text itself. The display cell is
	        # already non-empty after any reasoning chunk, so testing it would leave
	        # a reasoning-only reply labelled "Thinking..." forever.
	        if not response_content:
	            if thinking_content:
	                chat_history[-1][1] = f"🧠 Reasoning:\n\n{thinking_content}"
	            else:
	                chat_history[-1][1] = "No response received. Please try again."

	        yield "", chat_history

	    except Exception as exc:
	        # UI boundary: every failure is turned into a chat message.
	        notice = _describe_error(exc)
	        if response_content:
	            # Keep whatever part of the answer already streamed.
	            chat_history[-1][1] = f"{chat_history[-1][1]}\n\n{notice}"
	        else:
	            chat_history[-1][1] = notice
	        yield "", chat_history


	# ──────────────────────────────────────────────────────────────
	# Gradio UI — Production Coding Assistant Interface
	# ──────────────────────────────────────────────────────────────

	# Page layout, top to bottom: banner, thinking toggle, chat window,
	# input row, example prompts, disclaimer. Components render in creation order.
	with gr.Blocks(
	    title="Kimi K2.5 — AI Coding Assistant",
	    css="""
	footer { display: none !important; }
	""",
	    theme=gr.themes.Soft(
	        font=gr.themes.GoogleFont("Inter"),
	        neutral_hue="zinc",
	        secondary_hue="cyan",
	        primary_hue="violet",
	    ),
	) as demo:

	    # Static banner: logo tile, title, and four feature badges.
	    gr.HTML("""
	<div style="text-align: center; padding: 24px 0 16px 0;">
	<div style="display: inline-flex; align-items: center; gap: 12px; margin-bottom: 8px;">
	<div style="width: 48px; height: 48px; border-radius: 12px; background: linear-gradient(135deg, #7c3aed, #06b6d4); display: flex; align-items: center; justify-content: center; font-size: 24px; box-shadow: 0 0 24px rgba(139,92,246,0.3);">⚡</div>
	<div style="text-align: left;">
	<h1 style="margin: 0; font-size: 28px; font-weight: 700; background: linear-gradient(135deg, #a78bfa, #22d3ee); -webkit-background-clip: text; -webkit-text-fill-color: transparent;">Kimi K2.5</h1>
	<p style="margin: 0; font-size: 13px; color: #71717a;">AI Coding Assistant • Powered by NVIDIA H100s</p>
	</div>
	</div>
	<div style="display: flex; gap: 12px; justify-content: center; margin-top: 12px; flex-wrap: wrap;">
	<span style="padding: 4px 12px; border-radius: 9999px; background: rgba(139,92,246,0.1); border: 1px solid rgba(139,92,246,0.2); font-size: 11px; color: #a78bfa;">1.1T Parameters</span>
	<span style="padding: 4px 12px; border-radius: 9999px; background: rgba(34,211,238,0.1); border: 1px solid rgba(34,211,238,0.2); font-size: 11px; color: #22d3ee;">384 MoE Experts</span>
	<span style="padding: 4px 12px; border-radius: 9999px; background: rgba(52,211,153,0.1); border: 1px solid rgba(52,211,153,0.2); font-size: 11px; color: #34d399;">262K Context</span>
	<span style="padding: 4px 12px; border-radius: 9999px; background: rgba(251,191,36,0.1); border: 1px solid rgba(251,191,36,0.2); font-size: 11px; color: #fbbf24;">Thinking Mode</span>
	</div>
	</div>
	""")

	    # Toggle forwarded to respond() as its third argument; on by default.
	    enable_thinking = gr.Checkbox(
	        label="🧠 Enable Thinking Mode (chain-of-thought reasoning)",
	        info="When enabled, Kimi K2.5 shows its reasoning process before answering.",
	        value=True,
	    )

	    # Conversation window.
	    chatbot = gr.Chatbot(show_copy_button=True, height=500)

	    # Prompt box and send button share one row at a 9:1 width ratio.
	    with gr.Row():
	        msg_input = gr.Textbox(
	            placeholder="Ask Kimi K2.5 to write, debug, explain, or review code...",
	            show_label=False,
	            lines=2,
	            max_lines=6,
	            scale=9,
	        )
	        submit_btn = gr.Button("Send", variant="primary", scale=1)

	    # Clickable starter prompts that fill the prompt box.
	    gr.Examples(
	        label="Try these examples",
	        inputs=msg_input,
	        examples=[
	            "Build a REST API with authentication in Node.js",
	            "Debug this Python function returning wrong results",
	            "Refactor my React component for better performance",
	            "Explain how async/await works in Rust",
	            "Design a database schema for an e-commerce platform",
	            "Write a CLI tool in Go that parses CSV files",
	        ],
	    )

	    # Footer disclaimer.
	    gr.HTML("""
	<div style="text-align: center; padding: 12px 0; color: #52525b; font-size: 11px;">
	Kimi K2.5 can make mistakes. Review generated code carefully before using in production.
	<br>Backend-swappable: NVIDIA → SiliconFlow → Moonshot — change base_url in 5 seconds.
	</div>
	""")

	    # Pressing Enter in the prompt box and clicking Send run the same handler
	    # with the same inputs and outputs, so register both in one pass.
	    for register in (msg_input.submit, submit_btn.click):
	        register(
	            respond,
	            inputs=[msg_input, chatbot, enable_thinking],
	            outputs=[msg_input, chatbot],
	        )


	# Start the web server only when this file is executed as a script.
	if __name__ == "__main__":
	    demo.launch()