import json
import traceback

import gradio as gr
import uvicorn
from fastapi import FastAPI, Request
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse, StreamingResponse
from gradio import mount_gradio_app
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# --- CONFIGURATION ---
# Hugging Face Hub repository and the GGUF file inside it that is fetched
# with hf_hub_download when the module is loaded.
REPO_ID = "Qwen/Qwen2.5-Coder-7B-Instruct-GGUF"
FILENAME = "qwen2.5-coder-7b-instruct-q5_k_m.gguf"
# Passed to Llama as n_ctx.
CONTEXT_SIZE = 8192
# max_tokens used by the /v1/chat/completions endpoint when the request body
# does not provide one.
DEFAULT_MAX_TOKENS = 4096

# Load the model once at import time. `llm` stays None when either the
# download or the model construction fails, and every consumer in this module
# checks for that before generating.
print(f"Loading model {REPO_ID}...")
llm = None
try:
    # Resolve the GGUF file to a local path (downloaded by huggingface_hub).
    model_path = hf_hub_download(repo_id=REPO_ID, filename=FILENAME)
    llm = Llama(
        model_path=model_path,
        n_ctx=CONTEXT_SIZE,
        n_threads=2,
        n_batch=512,
        verbose=True,
    )
except Exception as exc:
    # Deliberately non-fatal: the web app still starts and reports the
    # missing model to callers instead of crashing on import.
    print(f"Critical Error: {exc}")

# --- API (FastAPI) ---
app = FastAPI()
# CORS: the API is intentionally callable from any origin, with any method
# and any request header.
#
# Credentials are disabled on purpose. A wildcard origin combined with
# allow_credentials=True is not a valid CORS configuration (FastAPI's CORS
# documentation requires explicit origins, methods and headers in that case),
# and it would let any website send cookie-bearing requests to this service.
# Nothing in this file relies on cookies or HTTP authentication.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)


@app.post("/v1/chat/completions")
async def chat_completions(request: Request):
    """Serve a chat completion from the module-level ``llm``.

    The JSON request body is read for:

    * ``messages`` -- required, a non-empty list, forwarded unchanged to
      ``llm.create_chat_completion``;
    * ``stream`` -- optional, defaults to ``False``;
    * ``temperature`` -- optional, defaults to ``0.4``;
    * ``max_tokens`` -- optional, defaults to ``DEFAULT_MAX_TOKENS``.

    Returns:
        * a ``StreamingResponse`` (``text/event-stream``) when ``stream`` is
          truthy: one ``data: <json>`` event per chunk followed by
          ``data: [DONE]``. If generation fails part-way, a single
          ``data: {"error": ...}`` event is sent instead of ``[DONE]``;
        * a ``JSONResponse`` carrying the completion otherwise;
        * a ``JSONResponse`` of the form ``{"error": ...}`` with status 400
          for a malformed request, or 500 when the model is not loaded or
          generation raises.
    """
    if not llm:
        return JSONResponse(content={"error": "Model not loaded"}, status_code=500)

    # Malformed input is a client error: answer 400 with a clear message
    # rather than letting it surface as an opaque 500.
    try:
        data = await request.json()
    except ValueError:  # json.JSONDecodeError and UnicodeDecodeError
        return JSONResponse(
            content={"error": "Request body must be valid JSON"}, status_code=400
        )
    if not isinstance(data, dict):
        return JSONResponse(
            content={"error": "Request body must be a JSON object"}, status_code=400
        )

    messages = data.get("messages")
    if not isinstance(messages, list) or not messages:
        return JSONResponse(
            content={"error": "'messages' must be a non-empty list"}, status_code=400
        )

    stream = data.get("stream", False)
    temperature = data.get("temperature", 0.4)
    max_tokens = data.get("max_tokens", DEFAULT_MAX_TOKENS)

    try:
        # NOTE(review): in the non-streaming case this call runs inline in
        # the coroutine, so the event loop is busy until generation ends.
        # Off-loading it to a worker thread would allow overlapping calls on
        # the single shared model -- confirm llama_cpp's thread-safety (or
        # add a lock) before doing that.
        output = llm.create_chat_completion(
            messages=messages,
            max_tokens=max_tokens,
            temperature=temperature,
            stream=stream,
        )
        if not stream:
            return JSONResponse(content=output)
    except Exception as e:
        # Keep the server-side log as informative as the Gradio path.
        traceback.print_exc()
        return JSONResponse(content={"error": str(e)}, status_code=500)

    def iter_content():
        # The chunks are produced while the response is already being sent,
        # i.e. outside the try/except above; failures must be handled here
        # or the client just sees the stream stop.
        try:
            for chunk in output:
                yield f"data: {json.dumps(chunk)}\n\n"
        except Exception as e:
            traceback.print_exc()
            error_payload = json.dumps({"error": str(e)})
            yield f"data: {error_payload}\n\n"
            return
        yield "data: [DONE]\n\n"

    return StreamingResponse(iter_content(), media_type="text/event-stream")


# --- GENERATION LOGIC FOR GRADIO ---


def user_input(user_message, history):
    """Append the submitted text to the chat as a new, unanswered turn.

    Args:
        user_message: Text taken from the input box.
        history: Chat history as a list of ``[user, assistant]`` pairs. May
            be ``None`` or empty when the chat holds no turns yet (the clear
            button in this file writes ``None`` into the chatbot).

    Returns:
        A tuple ``("", new_history)``. The empty string clears the input box;
        ``new_history`` is a new list ending with ``[user_message, None]``.
        The list passed in is not modified.
    """
    # `history or []` keeps a cleared (None) chat from raising TypeError.
    return "", [*(history or []), [user_message, None]]


def bot_response(history, system_prompt, temperature, max_tokens):
    """Stream the assistant's reply into the last turn of ``history``.

    Generator used as a Gradio event handler. After every received piece of
    text it writes the accumulated reply into ``history[-1][1]`` and yields
    ``history`` (the same list, mutated in place).

    Args:
        history: List of ``[user, assistant]`` pairs; the last pair holds the
            pending user message with ``None`` as its reply.
        system_prompt: Content of the system message sent first.
        temperature: Sampling temperature; converted with ``float``.
        max_tokens: Generation limit; converted with ``int``.

    Yields:
        ``history`` after each update. If the model is not loaded, or if
        generation raises, the last reply is set to an error text (appended
        to whatever was generated so far) and ``history`` is yielded once.
    """
    if not llm:
        history[-1][1] = "Error: Model failed to load. Check logs."
        yield history
        return

    # Convert Gradio's pair-based history into role/content dictionaries.
    messages = [{"role": "system", "content": system_prompt}]

    # Context window: at most the 10 turns that precede the pending one.
    for question, answer in history[:-1][-10:]:
        for role, text in (("user", question), ("assistant", answer)):
            if text:
                messages.append({"role": role, "content": text})

    # The pending user message always goes last.
    messages.append({"role": "user", "content": history[-1][0]})

    reply_so_far = ""
    try:
        chunks = llm.create_chat_completion(
            messages=messages,
            max_tokens=int(max_tokens),
            temperature=float(temperature),
            stream=True,
        )

        for chunk in chunks:
            delta = chunk["choices"][0]["delta"]
            if "content" not in delta:
                # Chunks without text produce no UI update.
                continue
            reply_so_far += delta["content"]
            # Refresh the assistant side of the last pair (pair-based format).
            history[-1][1] = reply_so_far
            yield history

    except Exception as e:
        traceback.print_exc()
        # Keep the partial answer visible and append the failure reason.
        history[-1][1] = reply_so_far + f"\n\n❌ **Error:** {str(e)}"
        yield history


# --- UI (Gradio Blocks) ---

# Give the chat panel a fixed share of the viewport and let it scroll.
custom_css = """
#chatbot {
    height: 70vh !important;
    overflow: auto;
}
"""

theme = gr.themes.Soft(primary_hue="blue", text_size="lg")

with gr.Blocks(theme=theme, css=custom_css, title="Qwen Coder Pro") as demo:
    gr.Markdown("# 💻 Qwen 2.5 Coder Assistant")

    with gr.Row():
        # Left column: generation settings.
        with gr.Column(scale=1, min_width=250):
            gr.Markdown("### ⚙️ Settings")
            system_prompt = gr.Textbox(
                label="System Prompt",
                value="You are an expert coding assistant. Write clean code.",
                lines=3,
            )
            temperature = gr.Slider(
                minimum=0.0, maximum=1.0, value=0.4, label="Temperature"
            )
            max_tokens = gr.Slider(
                minimum=512, maximum=8192, value=4096, label="Max Tokens"
            )
            clear_btn = gr.Button("🗑️ Clear Chat")

        # Right column: the conversation itself.
        with gr.Column(scale=4):
            # IMPORTANT: type="messages" was removed on purpose; the chatbot
            # uses the default history format ([user, assistant] pairs),
            # which is what user_input and bot_response read and write.
            chatbot = gr.Chatbot(
                label="Conversation",
                elem_id="chatbot",
                show_copy_button=True,  # Requires gradio>=3.37 (see requirements.txt)
                avatar_images=(None, "https://api.iconify.design/noto:robot.svg"),
            )

            msg = gr.Textbox(
                show_label=False, placeholder="Type your code question here...", lines=2
            )
            submit_btn = gr.Button("Run ➤", variant="primary")

    # Event wiring. Pressing Enter in the textbox and clicking the button run
    # the same two-step chain: user_input records the message and clears the
    # box (unqueued), then bot_response streams the reply into the chatbot.
    for _send_event in (msg.submit, submit_btn.click):
        _send_event(user_input, [msg, chatbot], [msg, chatbot], queue=False).then(
            bot_response, [chatbot, system_prompt, temperature, max_tokens], chatbot
        )
    # Clearing writes None into the chatbot component.
    clear_btn.click(lambda: None, None, chatbot, queue=False)

# Mount the Gradio UI on the same FastAPI application at the site root. The
# /v1/chat/completions route was registered on `app` before this mount.
# NOTE(review): presumably earlier routes take precedence over the "/" mount
# -- confirm against the FastAPI/Starlette routing rules.
app = mount_gradio_app(app, demo, path="/")

if __name__ == "__main__":
    # Direct execution: serve the combined app on all interfaces, port 7860.
    uvicorn.run(app, host="0.0.0.0", port=7860)