import gradio as gr
from openai import OpenAI
import os
# ──────────────────────────────────────────────────────────────
# Kimi K2.5 Production AI Coding Assistant
# Architecture: OpenAI-compatible client → NVIDIA API → Kimi K2.5
# ──────────────────────────────────────────────────────────────
# Shared API client. It speaks the OpenAI-compatible protocol but is aimed at
# NVIDIA's inference endpoint, so switching providers (NVIDIA → SiliconFlow →
# Moonshot → local vLLM) only means changing the base URL below; no other
# logic has to be rewritten.
# The key is read from the NVIDIA_API_KEY environment variable; when that
# variable is unset, None is handed to the client.
# NOTE(review): confirm how the client behaves when it receives api_key=None.
_NVIDIA_BASE_URL = "https://integrate.api.nvidia.com/v1"
client = OpenAI(
    api_key=os.environ.get("NVIDIA_API_KEY"),
    base_url=_NVIDIA_BASE_URL,
)
# Identifier of the model requested from the inference endpoint.
MODEL_ID = "moonshotai/kimi-k2.5"

# System prompt sent as the first message of every conversation. It is kept as
# one entry per output line and joined with newlines (no trailing newline), so
# individual instructions are easy to spot and edit.
SYSTEM_PROMPT = "\n".join(
    (
        "You are Kimi K2.5, an advanced AI coding assistant built by Moonshot AI.",
        "You are an expert in software engineering across all programming languages and frameworks.",
        "Your capabilities include:",
        "- Writing, reviewing, and debugging code in any language",
        "- Explaining complex programming concepts clearly",
        "- Suggesting best practices, design patterns, and architectural decisions",
        "- Analyzing code for performance, security, and maintainability",
        "- Generating complete applications, APIs, and systems",
        "- Helping with DevOps, databases, cloud infrastructure, and more",
        "When writing code:",
        "- Always use proper code blocks with language identifiers",
        "- Include comments for complex logic",
        "- Follow language-specific conventions and best practices",
        "- Provide complete, runnable code when possible",
        "- Suggest tests when appropriate",
        "When explaining:",
        "- Be thorough but concise",
        "- Use examples to illustrate concepts",
        "- Break complex topics into digestible parts",
        "- Reference relevant documentation or standards when helpful",
        "Be direct, accurate, and helpful. If you're unsure about something, say so rather than guessing.",
    )
)
def respond(message, chat_history, enable_thinking):
"""
Stream a response from Kimi K2.5 via NVIDIA API.
Uses Gradio's standard streaming chatbot pattern.
"""
if not message.strip():
yield message, chat_history
return
# Build conversation history in OpenAI message format
messages = [{"role": "system", "content": SYSTEM_PROMPT}]
# chat_history is a list of [user_msg, assistant_msg] pairs
for pair in chat_history:
if len(pair) >= 2 and pair[0] and pair[1]:
messages.append({"role": "user", "content": pair[0]})
messages.append({"role": "assistant", "content": pair[1]})
messages.append({"role": "user", "content": message})
# Add user message to chat history immediately
chat_history.append([message, ""])
try:
# Call NVIDIA's OpenAI-compatible streaming endpoint
kwargs = {
"model": MODEL_ID,
"messages": messages,
"stream": True,
"temperature": 1.0,
"top_p": 1.0,
"max_tokens": 16384,
}
if enable_thinking:
kwargs["extra_body"] = {"chat_template_kwargs": {"thinking": True}}
stream = client.chat.completions.create(**kwargs)
thinking_content = ""
response_content = ""
in_thinking_phase = False
thinking_finished = False
for chunk in stream:
if not chunk.choices:
continue
delta = chunk.choices[0].delta
# Handle thinking/reasoning content
if hasattr(delta, "reasoning_content") and delta.reasoning_content:
if not in_thinking_phase:
in_thinking_phase = True
thinking_content += delta.reasoning_content
# Show thinking phase in the chat
display = f"🧠 **Thinking...**\n\n{thinking_content}"
chat_history[-1][1] = display
yield "", chat_history
# Handle regular content
if delta.content:
if in_thinking_phase and not thinking_finished:
thinking_finished = True
in_thinking_phase = False
response_content += delta.content
if thinking_content:
display = (
f"🧠 Reasoning (click to expand)
\n\n"
f"{thinking_content}\n\n
AI Coding Assistant • Powered by NVIDIA H100s