import gradio as gr
from openai import OpenAI
import os

# ──────────────────────────────────────────────────────────────
# Kimi K2.5 Production AI Coding Assistant
# Architecture: OpenAI-compatible client → NVIDIA API → Kimi K2.5
# ──────────────────────────────────────────────────────────────

# Initialize the OpenAI-compatible client pointing at NVIDIA's inference servers.
# This keeps your agent "backend-swappable" — change base_url to switch providers
# (NVIDIA → SiliconFlow → Moonshot → local vLLM) without rewriting any logic.
#
# The key is read from NVIDIA_API_KEY only. When that variable is unset we pass a
# placeholder instead of None: with api_key=None the OpenAI client either falls
# back to OPENAI_API_KEY (which would send an unrelated credential to NVIDIA) or
# raises while this module is being imported, so the app would never start.
# With the placeholder the app starts and the first request fails with a 401,
# which the chat handler reports as an authentication error.
client = OpenAI(
    base_url="https://integrate.api.nvidia.com/v1",
    api_key=os.getenv("NVIDIA_API_KEY") or "NVIDIA_API_KEY-not-set",
)

# Model identifier passed as `model` on every chat completion request.
MODEL_ID = "moonshotai/kimi-k2.5"

# System prompt placed first in the message list of every request. It fixes the
# assistant's persona and its conventions for writing and explaining code.
# NOTE: this text is sent to the model verbatim — edit it deliberately.
SYSTEM_PROMPT = """You are Kimi K2.5, an advanced AI coding assistant built by Moonshot AI.
You are an expert in software engineering across all programming languages and frameworks.

Your capabilities include:
- Writing, reviewing, and debugging code in any language
- Explaining complex programming concepts clearly
- Suggesting best practices, design patterns, and architectural decisions
- Analyzing code for performance, security, and maintainability
- Generating complete applications, APIs, and systems
- Helping with DevOps, databases, cloud infrastructure, and more

When writing code:
- Always use proper code blocks with language identifiers
- Include comments for complex logic
- Follow language-specific conventions and best practices
- Provide complete, runnable code when possible
- Suggest tests when appropriate

When explaining:
- Be thorough but concise
- Use examples to illustrate concepts
- Break complex topics into digestible parts
- Reference relevant documentation or standards when helpful

Be direct, accurate, and helpful. If you're unsure about something, say so rather than guessing."""


# Markup that respond() itself writes into the assistant slot of the chat history.
# It is presentation only and is removed again before a stored turn is sent back
# to the model.
_REASONING_OPEN = "<details><summary>🧠 Reasoning (click to expand)</summary>\n\n"
_REASONING_CLOSE = "\n\n</details>\n\n"
_NO_RESPONSE = "No response received. Please try again."
_UI_ONLY_PREFIXES = (
    "🧠 **Thinking...**",
    "🧠 **Reasoning:**",
    "❌ **Authentication Error**",
    "❌ **Rate Limit**",
    "❌ **Server Error**",
    "❌ **Error**",
)


def _model_visible_reply(reply):
    """Return the part of a stored assistant message that the model actually said."""
    if not isinstance(reply, str):
        return reply
    # Error banners and reasoning-only placeholders are not model answers.
    if reply == _NO_RESPONSE or reply.startswith(_UI_ONLY_PREFIXES):
        return ""
    if reply.startswith(_REASONING_OPEN):
        # Drop the collapsible reasoning block; keep only the answer after it.
        _, closing, answer = reply.partition(_REASONING_CLOSE)
        return answer if closing else reply
    return reply


def _build_messages(chat_history, message):
    """Convert [user, assistant] history pairs plus the new message to OpenAI format."""
    messages = [{"role": "system", "content": SYSTEM_PROMPT}]
    for pair in chat_history:
        if len(pair) < 2:
            continue
        user_text, reply = pair[0], _model_visible_reply(pair[1])
        # Turns without a real answer (failed or empty) are left out entirely.
        if user_text and reply:
            messages.append({"role": "user", "content": user_text})
            messages.append({"role": "assistant", "content": reply})
    messages.append({"role": "user", "content": message})
    return messages


def _render_reply(thinking, answer, streaming):
    """Build the chat-bubble text for the current reasoning/answer buffers."""
    if answer:
        if thinking:
            return f"{_REASONING_OPEN}{thinking}{_REASONING_CLOSE}{answer}"
        return answer
    if thinking:
        header = "🧠 **Thinking...**" if streaming else "🧠 **Reasoning:**"
        return f"{header}\n\n{thinking}"
    return "" if streaming else _NO_RESPONSE


def _describe_error(exc):
    """Map an exception raised by the API call to a user-facing chat message."""
    text = str(exc)
    status = getattr(exc, "status_code", None)
    if not isinstance(status, int):
        # No HTTP status on the exception: fall back to scanning its message.
        if "401" in text or "auth" in text.lower():
            status = 401
        elif "429" in text:
            status = 429
        elif any(code in text for code in ("500", "502", "503")):
            status = 500

    if status == 401:
        return "❌ **Authentication Error**: NVIDIA_API_KEY is missing or invalid. Set it in Space Secrets."
    if status == 429:
        return "❌ **Rate Limit**: Too many requests. Wait a moment and try again."
    if isinstance(status, int) and 500 <= status < 600:
        return "❌ **Server Error**: NVIDIA API temporarily unavailable. Try again shortly."
    return f"❌ **Error**: {text}"


def respond(message, chat_history, enable_thinking):
    """
    Stream a response from Kimi K2.5 via NVIDIA API.
    Uses Gradio's standard streaming chatbot pattern.

    Args:
        message: Text typed by the user. Blank or missing input is ignored.
        chat_history: List of [user_msg, assistant_msg] pairs shown in the
            chatbot. The new turn is appended to this list and updated in place.
        enable_thinking: Whether to request the model's reasoning phase. The
            flag is sent explicitly in both states so that unchecking it
            actually switches reasoning off.

    Yields:
        (textbox_value, chat_history) tuples. The textbox value is "" once a
        request has started, which clears the input box.
    """
    chat_history = [] if chat_history is None else chat_history
    if not message or not message.strip():
        yield message or "", chat_history
        return

    # Build the request before appending the new (still empty) turn.
    messages = _build_messages(chat_history, message)

    # Add user message to chat history immediately
    chat_history.append([message, ""])

    stream = None
    try:
        # Call NVIDIA's OpenAI-compatible streaming endpoint
        stream = client.chat.completions.create(
            model=MODEL_ID,
            messages=messages,
            stream=True,
            temperature=1.0,
            top_p=1.0,
            max_tokens=16384,
            extra_body={"chat_template_kwargs": {"thinking": bool(enable_thinking)}},
        )

        thinking_content = ""
        response_content = ""

        for chunk in stream:
            if not chunk.choices:
                continue

            delta = chunk.choices[0].delta
            reasoning_piece = getattr(delta, "reasoning_content", None)
            answer_piece = delta.content
            if not (reasoning_piece or answer_piece):
                continue

            thinking_content += reasoning_piece or ""
            response_content += answer_piece or ""

            # Live reasoning is shown until the answer starts; after that the
            # reasoning is folded into a collapsible block above the answer.
            chat_history[-1][1] = _render_reply(
                thinking_content, response_content, streaming=True
            )
            yield "", chat_history

        # Final output — make sure something is there, and replace the
        # "Thinking..." header if the model produced reasoning but no answer.
        chat_history[-1][1] = _render_reply(
            thinking_content, response_content, streaming=False
        )
        yield "", chat_history

    except Exception as exc:
        # UI boundary: report the failure in the chat instead of crashing the app.
        chat_history[-1][1] = _describe_error(exc)
        yield "", chat_history

    finally:
        # Release the HTTP connection even when the generator is abandoned
        # part-way through (e.g. the browser disconnects mid-stream).
        close_stream = getattr(stream, "close", None)
        if callable(close_stream):
            close_stream()


# ──────────────────────────────────────────────────────────────
# Gradio UI — Production Coding Assistant Interface
# ──────────────────────────────────────────────────────────────

# Visual theme applied to the whole app.
_APP_THEME = gr.themes.Soft(
    primary_hue="violet",
    secondary_hue="cyan",
    neutral_hue="zinc",
    font=gr.themes.GoogleFont("Inter"),
)

# Custom CSS: hides Gradio's default footer.
_APP_CSS = """
        footer { display: none !important; }
    """

# Banner shown at the top of the page (logo, title, feature badges).
_HEADER_HTML = """
        <div style="text-align: center; padding: 24px 0 16px 0;">
            <div style="display: inline-flex; align-items: center; gap: 12px; margin-bottom: 8px;">
                <div style="width: 48px; height: 48px; border-radius: 12px; background: linear-gradient(135deg, #7c3aed, #06b6d4); display: flex; align-items: center; justify-content: center; font-size: 24px; box-shadow: 0 0 24px rgba(139,92,246,0.3);">⚡</div>
                <div style="text-align: left;">
                    <h1 style="margin: 0; font-size: 28px; font-weight: 700; background: linear-gradient(135deg, #a78bfa, #22d3ee); -webkit-background-clip: text; -webkit-text-fill-color: transparent;">Kimi K2.5</h1>
                    <p style="margin: 0; font-size: 13px; color: #71717a;">AI Coding Assistant • Powered by NVIDIA H100s</p>
                </div>
            </div>
            <div style="display: flex; gap: 12px; justify-content: center; margin-top: 12px; flex-wrap: wrap;">
                <span style="padding: 4px 12px; border-radius: 9999px; background: rgba(139,92,246,0.1); border: 1px solid rgba(139,92,246,0.2); font-size: 11px; color: #a78bfa;">1.1T Parameters</span>
                <span style="padding: 4px 12px; border-radius: 9999px; background: rgba(34,211,238,0.1); border: 1px solid rgba(34,211,238,0.2); font-size: 11px; color: #22d3ee;">384 MoE Experts</span>
                <span style="padding: 4px 12px; border-radius: 9999px; background: rgba(52,211,153,0.1); border: 1px solid rgba(52,211,153,0.2); font-size: 11px; color: #34d399;">262K Context</span>
                <span style="padding: 4px 12px; border-radius: 9999px; background: rgba(251,191,36,0.1); border: 1px solid rgba(251,191,36,0.2); font-size: 11px; color: #fbbf24;">Thinking Mode</span>
            </div>
        </div>
    """

# Disclaimer shown below the examples.
_FOOTER_HTML = """
        <div style="text-align: center; padding: 12px 0; color: #52525b; font-size: 11px;">
            Kimi K2.5 can make mistakes. Review generated code carefully before using in production.
            <br>Backend-swappable: NVIDIA → SiliconFlow → Moonshot — change base_url in 5 seconds.
        </div>
    """

# One-click starter prompts; selecting one fills the message box.
_EXAMPLE_PROMPTS = [
    "Build a REST API with authentication in Node.js",
    "Debug this Python function returning wrong results",
    "Refactor my React component for better performance",
    "Explain how async/await works in Rust",
    "Design a database schema for an e-commerce platform",
    "Write a CLI tool in Go that parses CSV files",
]

with gr.Blocks(
    theme=_APP_THEME,
    css=_APP_CSS,
    title="Kimi K2.5 — AI Coding Assistant",
) as demo:

    # Components are created top to bottom in the order they appear on the page.
    gr.HTML(_HEADER_HTML)

    enable_thinking = gr.Checkbox(
        label="🧠 Enable Thinking Mode (chain-of-thought reasoning)",
        info="When enabled, Kimi K2.5 shows its reasoning process before answering.",
        value=True,
    )

    chatbot = gr.Chatbot(show_copy_button=True, height=500)

    with gr.Row():
        msg_input = gr.Textbox(
            placeholder="Ask Kimi K2.5 to write, debug, explain, or review code...",
            show_label=False,
            lines=2,
            max_lines=6,
            scale=9,
        )
        submit_btn = gr.Button("Send", variant="primary", scale=1)

    gr.Examples(
        examples=_EXAMPLE_PROMPTS,
        inputs=msg_input,
        label="Try these examples",
    )

    gr.HTML(_FOOTER_HTML)

    # Pressing Enter in the textbox and clicking "Send" run the same streaming
    # handler with the same inputs and outputs.
    _chat_io = {
        "inputs": [msg_input, chatbot, enable_thinking],
        "outputs": [msg_input, chatbot],
    }
    msg_input.submit(respond, **_chat_io)
    submit_btn.click(respond, **_chat_io)


# Launch the Gradio app only when this file is run as a script, not on import.
if __name__ == "__main__":
    demo.launch()