"""Kimi K2.5 AI coding assistant: a Gradio chat UI over an OpenAI-compatible API.

Architecture: OpenAI-compatible client → NVIDIA API → Kimi K2.5
"""

import os
import re

import gradio as gr
from openai import OpenAI

# ──────────────────────────────────────────────────────────────
# Kimi K2.5 Production AI Coding Assistant
# Architecture: OpenAI-compatible client → NVIDIA API → Kimi K2.5
# ──────────────────────────────────────────────────────────────

# Placeholder sent when no API key is configured.  The OpenAI constructor
# raises if it cannot resolve a key, which would crash the app at import time
# and make the friendly "Authentication Error" message in respond()
# unreachable.  With a placeholder the app starts and the first request fails
# with a 401 that respond() reports in the chat.
_MISSING_KEY_PLACEHOLDER = "missing-nvidia-api-key"

# Initialize the OpenAI-compatible client pointing at NVIDIA's inference servers.
# This keeps the agent "backend-swappable": change base_url to switch providers
# (NVIDIA → SiliconFlow → Moonshot → local vLLM) without rewriting any logic.
client = OpenAI(
    base_url="https://integrate.api.nvidia.com/v1",
    # OPENAI_API_KEY is kept as a fallback because the client itself falls
    # back to it when api_key is None.
    api_key=(
        os.getenv("NVIDIA_API_KEY")
        or os.getenv("OPENAI_API_KEY")
        or _MISSING_KEY_PLACEHOLDER
    ),
)

MODEL_ID = "moonshotai/kimi-k2.5"

SYSTEM_PROMPT = """You are Kimi K2.5, an advanced AI coding assistant built by Moonshot AI. You are an expert in software engineering across all programming languages and frameworks.

Your capabilities include:
- Writing, reviewing, and debugging code in any language
- Explaining complex programming concepts clearly
- Suggesting best practices, design patterns, and architectural decisions
- Analyzing code for performance, security, and maintainability
- Generating complete applications, APIs, and systems
- Helping with DevOps, databases, cloud infrastructure, and more

When writing code:
- Always use proper code blocks with language identifiers
- Include comments for complex logic
- Follow language-specific conventions and best practices
- Provide complete, runnable code when possible
- Suggest tests when appropriate

When explaining:
- Be thorough but concise
- Use examples to illustrate concepts
- Break complex topics into digestible parts
- Reference relevant documentation or standards when helpful

Be direct, accurate, and helpful. If you're unsure about something, say so rather than guessing."""

# Text shown when the stream ends without any content or reasoning.
_NO_RESPONSE_TEXT = "No response received. Please try again."

# Assistant bubbles starting with these prefixes are UI-only status output
# (error banners, reasoning-only turns) and must not be replayed to the model.
_UI_ONLY_PREFIXES = ("❌", "🧠 **")

# Matches the collapsible reasoning block that respond() puts in front of the
# answer, so it can be removed before the answer is sent back as history.
_REASONING_BLOCK_RE = re.compile(
    r"\A\s*<details>.*?</details>\s*",
    re.DOTALL,
)


def _assistant_text_for_history(display_text):
    """Return the model-facing text for a rendered assistant bubble.

    Strips the leading reasoning block and returns "" for bubbles that only
    contain UI status output, so the caller can drop the whole turn.
    Non-string values are returned unchanged.
    """
    if not isinstance(display_text, str):
        return display_text
    stripped = display_text.strip()
    if stripped == _NO_RESPONSE_TEXT or stripped.startswith(_UI_ONLY_PREFIXES):
        return ""
    return _REASONING_BLOCK_RE.sub("", display_text, count=1).strip()


def _build_messages(chat_history, message):
    """Convert Gradio [user, assistant] pairs plus the new message to OpenAI format."""
    messages = [{"role": "system", "content": SYSTEM_PROMPT}]
    for pair in chat_history:
        if len(pair) < 2 or not pair[0] or not pair[1]:
            continue
        assistant_text = _assistant_text_for_history(pair[1])
        if not assistant_text:
            # Failed or reasoning-only turn: skip both halves so the model
            # never sees an unanswered user message followed by another one.
            continue
        messages.append({"role": "user", "content": pair[0]})
        messages.append({"role": "assistant", "content": assistant_text})
    messages.append({"role": "user", "content": message})
    return messages


def _format_error(exc):
    """Map an exception raised by the API call to a user-facing chat message.

    Prefers the HTTP status code carried by the exception; falls back to the
    substring checks on the message text when no status code is available.
    """
    error_msg = str(exc)
    status = getattr(exc, "status_code", None)

    if status is not None:
        is_auth = status == 401
        is_rate_limited = status == 429
        is_server_error = 500 <= status < 600
    else:
        is_auth = "401" in error_msg or "auth" in error_msg.lower()
        is_rate_limited = "429" in error_msg
        is_server_error = any(code in error_msg for code in ("500", "502", "503"))

    if is_auth:
        return "❌ **Authentication Error**: NVIDIA_API_KEY is missing or invalid. Set it in Space Secrets."
    if is_rate_limited:
        return "❌ **Rate Limit**: Too many requests. Wait a moment and try again."
    if is_server_error:
        return "❌ **Server Error**: NVIDIA API temporarily unavailable. Try again shortly."
    return f"❌ **Error**: {error_msg}"


def respond(message, chat_history, enable_thinking):
    """Stream a response from Kimi K2.5 via the NVIDIA API.

    Generator following Gradio's streaming chatbot pattern: every yield is a
    ``(textbox_value, chat_history)`` tuple matching the two wired outputs.

    Args:
        message: Text the user submitted.
        chat_history: List of ``[user_msg, assistant_msg]`` pairs; the new
            turn is appended to it and its assistant half is updated in place
            as chunks arrive.
        enable_thinking: Whether to request thinking mode from the model.

    Yields:
        ``("", chat_history)`` after each update, which clears the textbox.
        For an empty message the input is yielded back unchanged and no
        request is made.
    """
    if chat_history is None:
        chat_history = []

    if not message or not message.strip():
        yield message or "", chat_history
        return

    # Build the request before appending the new (still empty) turn.
    messages = _build_messages(chat_history, message)

    # Add user message to chat history immediately.
    chat_history.append([message, ""])

    try:
        # Call NVIDIA's OpenAI-compatible streaming endpoint.  The thinking
        # flag is always sent explicitly so that unchecking the box requests
        # non-thinking mode instead of leaving the backend default in force.
        stream = client.chat.completions.create(
            model=MODEL_ID,
            messages=messages,
            stream=True,
            temperature=1.0,
            top_p=1.0,
            max_tokens=16384,
            extra_body={"chat_template_kwargs": {"thinking": bool(enable_thinking)}},
        )

        thinking_content = ""
        response_content = ""

        for chunk in stream:
            if not chunk.choices:
                continue

            delta = chunk.choices[0].delta

            # Handle thinking/reasoning content.
            reasoning_piece = getattr(delta, "reasoning_content", None)
            if reasoning_piece:
                thinking_content += reasoning_piece
                # Show thinking phase in the chat.
                chat_history[-1][1] = f"🧠 **Thinking...**\n\n{thinking_content}"
                yield "", chat_history

            # Handle regular content.
            if delta.content:
                response_content += delta.content
                if thinking_content:
                    # NOTE(review): the <details>/<summary> wrapper is
                    # reconstructed; the original markup was lost. Confirm it
                    # matches the intended collapsible rendering.
                    chat_history[-1][1] = (
                        "<details>\n"
                        "<summary>🧠 Reasoning (click to expand)</summary>\n\n"
                        f"{thinking_content}\n\n"
                        "</details>\n\n"
                        f"{response_content}"
                    )
                else:
                    chat_history[-1][1] = response_content
                yield "", chat_history

        # Final output: make sure something final is there.  Decide on
        # response_content rather than on the bubble text: after a
        # reasoning-only stream the bubble still holds the "Thinking..."
        # placeholder, which would otherwise never be replaced.
        if not response_content:
            if thinking_content:
                chat_history[-1][1] = f"🧠 **Reasoning:**\n\n{thinking_content}"
            else:
                chat_history[-1][1] = _NO_RESPONSE_TEXT
            yield "", chat_history

    except Exception as exc:  # UI boundary: report every failure in the chat.
        chat_history[-1][1] = _format_error(exc)
        yield "", chat_history


# ──────────────────────────────────────────────────────────────
# Gradio UI — Production Coding Assistant Interface
# ──────────────────────────────────────────────────────────────

with gr.Blocks(
    theme=gr.themes.Soft(
        primary_hue="violet",
        secondary_hue="cyan",
        neutral_hue="zinc",
        font=gr.themes.GoogleFont("Inter"),
    ),
    css="""
    footer { display: none !important; }
    """,
    title="Kimi K2.5 — AI Coding Assistant",
) as demo:

    # NOTE(review): the banner's original markup and styling were lost; only
    # minimal structural tags are used here. Restore the styling if available.
    gr.HTML("""
    <div>
        <h1>Kimi K2.5</h1>
        <p>AI Coding Assistant • Powered by NVIDIA H100s</p>
        <p>1.1T Parameters 384 MoE Experts 262K Context Thinking Mode</p>
    </div>
    """)

    enable_thinking = gr.Checkbox(
        value=True,
        label="🧠 Enable Thinking Mode (chain-of-thought reasoning)",
        info="When enabled, Kimi K2.5 shows its reasoning process before answering.",
    )

    chatbot = gr.Chatbot(
        height=500,
        show_copy_button=True,
    )

    with gr.Row():
        msg_input = gr.Textbox(
            scale=9,
            placeholder="Ask Kimi K2.5 to write, debug, explain, or review code...",
            show_label=False,
            lines=2,
            max_lines=6,
        )
        submit_btn = gr.Button("Send", scale=1, variant="primary")

    gr.Examples(
        examples=[
            "Build a REST API with authentication in Node.js",
            "Debug this Python function returning wrong results",
            "Refactor my React component for better performance",
            "Explain how async/await works in Rust",
            "Design a database schema for an e-commerce platform",
            "Write a CLI tool in Go that parses CSV files",
        ],
        inputs=msg_input,
        label="Try these examples",
    )

    # NOTE(review): footer markup reconstructed with plain paragraphs; the
    # original tags were lost.
    gr.HTML("""
    <div>
        <p>Kimi K2.5 can make mistakes. Review generated code carefully before using in production.</p>
        <p>Backend-swappable: NVIDIA → SiliconFlow → Moonshot — change base_url in 5 seconds.</p>
    </div>
    """)

    # Wire up the chat using Gradio's standard streaming pattern: Enter in
    # the textbox and the Send button both run respond().
    msg_input.submit(
        respond,
        inputs=[msg_input, chatbot, enable_thinking],
        outputs=[msg_input, chatbot],
    )
    submit_btn.click(
        respond,
        inputs=[msg_input, chatbot, enable_thinking],
        outputs=[msg_input, chatbot],
    )


if __name__ == "__main__":
    demo.launch()