import logging
import os

import gradio as gr
from openai import OpenAI
|
|
| |
| |
| |
| |
|
|
| |
| |
| |
# OpenAI-compatible client pointed at NVIDIA's hosted inference endpoint.
#
# When api_key is None the OpenAI SDK falls back to the OPENAI_API_KEY
# environment variable (which would send an unrelated credential to NVIDIA) or,
# if that is unset too, raises at import time before the UI can start. Passing
# an explicit placeholder keeps the app bootable; the first request then fails
# with HTTP 401, which the chat handler reports as a missing/invalid key.
client = OpenAI(
    base_url="https://integrate.api.nvidia.com/v1",
    api_key=os.getenv("NVIDIA_API_KEY") or "NVIDIA_API_KEY-not-set",
)

# Model identifier sent with every chat-completion request.
MODEL_ID = "moonshotai/kimi-k2.5"

# System message placed first in every conversation sent to the model.
SYSTEM_PROMPT = """You are Kimi K2.5, an advanced AI coding assistant built by Moonshot AI.
You are an expert in software engineering across all programming languages and frameworks.

Your capabilities include:
- Writing, reviewing, and debugging code in any language
- Explaining complex programming concepts clearly
- Suggesting best practices, design patterns, and architectural decisions
- Analyzing code for performance, security, and maintainability
- Generating complete applications, APIs, and systems
- Helping with DevOps, databases, cloud infrastructure, and more

When writing code:
- Always use proper code blocks with language identifiers
- Include comments for complex logic
- Follow language-specific conventions and best practices
- Provide complete, runnable code when possible
- Suggest tests when appropriate

When explaining:
- Be thorough but concise
- Use examples to illustrate concepts
- Break complex topics into digestible parts
- Reference relevant documentation or standards when helpful

Be direct, accurate, and helpful. If you're unsure about something, say so rather than guessing."""
|
|
|
|
logger = logging.getLogger(__name__)

# Wrapper the UI puts around streamed reasoning. Kept as constants so the same
# text is used both to render a reply and to strip the wrapper again when the
# displayed history is converted back into API messages.
_REASONING_OPEN = "<details><summary>🧠 Reasoning (click to expand)</summary>\n\n"
_REASONING_CLOSE = "\n\n</details>\n\n"

# Assistant cells that start with one of these prefixes (or equal the
# placeholder) were written by this UI, not by the model.
_UI_ONLY_PREFIXES = ("❌ **", "🧠 **")
_NO_RESPONSE = "No response received. Please try again."


def _answer_text(displayed):
    """Return the model's answer from a displayed assistant cell, or "" if it holds only UI text."""
    if not isinstance(displayed, str):
        return displayed
    if displayed.startswith(_REASONING_OPEN):
        # Drop the collapsed reasoning block; only the final answer is context.
        _, _, answer = displayed.partition(_REASONING_CLOSE)
        return answer
    if displayed.startswith(_UI_ONLY_PREFIXES) or displayed == _NO_RESPONSE:
        return ""
    return displayed


def _build_messages(chat_history, message):
    """Convert the displayed [user, assistant] pairs plus the new message into API messages."""
    messages = [{"role": "system", "content": SYSTEM_PROMPT}]
    for pair in chat_history:
        if len(pair) < 2 or not pair[0] or not pair[1]:
            continue
        answer = _answer_text(pair[1])
        if not answer:
            # Error banners / reasoning-only turns are UI artifacts; replaying
            # them as assistant turns would pollute the model's context.
            continue
        messages.append({"role": "user", "content": pair[0]})
        messages.append({"role": "assistant", "content": answer})
    messages.append({"role": "user", "content": message})
    return messages


def _render(thinking, answer):
    """Format the assistant cell: the answer, preceded by collapsed reasoning when present."""
    if not thinking:
        return answer
    return f"{_REASONING_OPEN}{thinking}{_REASONING_CLOSE}{answer}"


def _describe_error(exc):
    """Map an exception from the API call to the user-facing error text."""
    text = str(exc)
    # The OpenAI SDK's HTTP status errors expose `status_code`; prefer it over
    # substring matching, which misfires when the message merely contains
    # digits such as "500" or words such as "author".
    status = getattr(exc, "status_code", None)
    if status is None:
        # No status available (e.g. connection errors): keep the text heuristics.
        if "401" in text or "auth" in text.lower():
            status = 401
        elif "429" in text:
            status = 429
        elif "500" in text or "502" in text or "503" in text:
            status = 500

    if status == 401:
        return "❌ **Authentication Error**: NVIDIA_API_KEY is missing or invalid. Set it in Space Secrets."
    if status == 429:
        return "❌ **Rate Limit**: Too many requests. Wait a moment and try again."
    if isinstance(status, int) and status >= 500:
        return "❌ **Server Error**: NVIDIA API temporarily unavailable. Try again shortly."
    return f"❌ **Error**: {text}"


def respond(message, chat_history, enable_thinking):
    """
    Stream a response from Kimi K2.5 via NVIDIA API.

    Generator used as a Gradio event handler. Each yielded value is a
    ``(textbox_value, chat_history)`` tuple; the textbox value is ``""`` once a
    request has been started so the input is cleared.

    Args:
        message: Text the user submitted. Blank or ``None`` input is ignored.
        chat_history: List of ``[user_text, assistant_text]`` pairs as shown in
            the chatbot. It is extended in place with the new turn. ``None`` is
            treated as an empty history.
        enable_thinking: Whether to request the model's reasoning mode. The
            flag is always sent explicitly so that unchecking it actually
            disables reasoning instead of leaving the model on its default.

    Yields:
        ``(textbox_value, chat_history)`` after every streamed update, and once
        more with the final (or error) state.
    """
    if chat_history is None:
        chat_history = []
    if not message or not message.strip():
        yield message or "", chat_history
        return

    messages = _build_messages(chat_history, message)

    # Placeholder row for the reply; updated in place while streaming.
    chat_history.append([message, ""])

    thinking = ""
    answer = ""
    stream = None
    try:
        stream = client.chat.completions.create(
            model=MODEL_ID,
            messages=messages,
            stream=True,
            temperature=1.0,
            top_p=1.0,
            max_tokens=16384,
            extra_body={"chat_template_kwargs": {"thinking": bool(enable_thinking)}},
        )

        for chunk in stream:
            if not chunk.choices:
                continue
            delta = chunk.choices[0].delta

            # `reasoning_content` is a provider extension and may be absent.
            reasoning = getattr(delta, "reasoning_content", None)
            if reasoning:
                thinking += reasoning
                if answer:
                    # Late reasoning must not hide an answer already shown.
                    chat_history[-1][1] = _render(thinking, answer)
                else:
                    chat_history[-1][1] = f"🧠 **Thinking...**\n\n{thinking}"
                yield "", chat_history

            if delta.content:
                answer += delta.content
                chat_history[-1][1] = _render(thinking, answer)
                yield "", chat_history

        # Decide on `answer`, not on the cell text: the cell is already
        # non-empty whenever reasoning was streamed, which previously left a
        # reasoning-only reply stuck on the "Thinking..." label.
        if not answer:
            if thinking:
                chat_history[-1][1] = f"🧠 **Reasoning:**\n\n{thinking}"
            else:
                chat_history[-1][1] = _NO_RESPONSE

        yield "", chat_history

    except Exception as exc:
        # UI boundary: keep the traceback in the logs, show a short message.
        logger.exception("Chat completion request failed")
        chat_history[-1][1] = _describe_error(exc)
        yield "", chat_history

    finally:
        # Release the HTTP connection even if the generator is closed early
        # (e.g. the user cancels the request mid-stream).
        close = getattr(stream, "close", None)
        if close is not None:
            close()
|
|
|
|
| |
| |
| |
|
|
# Visual theme applied to the whole app.
_theme = gr.themes.Soft(
    primary_hue="violet",
    secondary_hue="cyan",
    neutral_hue="zinc",
    font=gr.themes.GoogleFont("Inter"),
)

# Extra CSS: hides the page footer.
_custom_css = """
footer { display: none !important; }
"""

# Banner rendered at the top of the page (logo, title, feature badges).
_header_html = """
<div style="text-align: center; padding: 24px 0 16px 0;">
  <div style="display: inline-flex; align-items: center; gap: 12px; margin-bottom: 8px;">
    <div style="width: 48px; height: 48px; border-radius: 12px; background: linear-gradient(135deg, #7c3aed, #06b6d4); display: flex; align-items: center; justify-content: center; font-size: 24px; box-shadow: 0 0 24px rgba(139,92,246,0.3);">⚡</div>
    <div style="text-align: left;">
      <h1 style="margin: 0; font-size: 28px; font-weight: 700; background: linear-gradient(135deg, #a78bfa, #22d3ee); -webkit-background-clip: text; -webkit-text-fill-color: transparent;">Kimi K2.5</h1>
      <p style="margin: 0; font-size: 13px; color: #71717a;">AI Coding Assistant • Powered by NVIDIA H100s</p>
    </div>
  </div>
  <div style="display: flex; gap: 12px; justify-content: center; margin-top: 12px; flex-wrap: wrap;">
    <span style="padding: 4px 12px; border-radius: 9999px; background: rgba(139,92,246,0.1); border: 1px solid rgba(139,92,246,0.2); font-size: 11px; color: #a78bfa;">1.1T Parameters</span>
    <span style="padding: 4px 12px; border-radius: 9999px; background: rgba(34,211,238,0.1); border: 1px solid rgba(34,211,238,0.2); font-size: 11px; color: #22d3ee;">384 MoE Experts</span>
    <span style="padding: 4px 12px; border-radius: 9999px; background: rgba(52,211,153,0.1); border: 1px solid rgba(52,211,153,0.2); font-size: 11px; color: #34d399;">262K Context</span>
    <span style="padding: 4px 12px; border-radius: 9999px; background: rgba(251,191,36,0.1); border: 1px solid rgba(251,191,36,0.2); font-size: 11px; color: #fbbf24;">Thinking Mode</span>
  </div>
</div>
"""

# Disclaimer rendered below the examples.
_footer_html = """
<div style="text-align: center; padding: 12px 0; color: #52525b; font-size: 11px;">
  Kimi K2.5 can make mistakes. Review generated code carefully before using in production.
  <br>Backend-swappable: NVIDIA → SiliconFlow → Moonshot — change base_url in 5 seconds.
</div>
"""

# Prompts offered as one-click examples; clicking one fills the input box.
_example_prompts = [
    "Build a REST API with authentication in Node.js",
    "Debug this Python function returning wrong results",
    "Refactor my React component for better performance",
    "Explain how async/await works in Rust",
    "Design a database schema for an e-commerce platform",
    "Write a CLI tool in Go that parses CSV files",
]

with gr.Blocks(
    theme=_theme,
    css=_custom_css,
    title="Kimi K2.5 — AI Coding Assistant",
) as demo:
    # Components are created in the order they appear on the page.
    gr.HTML(_header_html)

    enable_thinking = gr.Checkbox(
        value=True,
        label="🧠 Enable Thinking Mode (chain-of-thought reasoning)",
        info="When enabled, Kimi K2.5 shows its reasoning process before answering.",
    )

    chatbot = gr.Chatbot(height=500, show_copy_button=True)

    with gr.Row():
        msg_input = gr.Textbox(
            scale=9,
            placeholder="Ask Kimi K2.5 to write, debug, explain, or review code...",
            show_label=False,
            lines=2,
            max_lines=6,
        )
        submit_btn = gr.Button("Send", scale=1, variant="primary")

    gr.Examples(
        examples=_example_prompts,
        inputs=msg_input,
        label="Try these examples",
    )

    gr.HTML(_footer_html)

    # Pressing Enter in the textbox and clicking "Send" run the same handler
    # with the same inputs and outputs.
    for _register in (msg_input.submit, submit_btn.click):
        _register(
            respond,
            inputs=[msg_input, chatbot, enable_thinking],
            outputs=[msg_input, chatbot],
        )


if __name__ == "__main__":
    demo.launch()
|
|