import os
import time
import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
from fastapi import FastAPI
from pydantic import BaseModel
from threading import Thread
import uvicorn

# ----------------------------
# Model
# ----------------------------
model_path = hf_hub_download(
    repo_id="khazarai/Qwen3-4B-Qwen3.6-plus-Reasoning-Distilled-GGUF",
    filename="Qwen3-4B-Thinking-2507.Q4_1.gguf"
)

llm = Llama(
    model_path=model_path,
    n_ctx=4096,
    n_threads=os.cpu_count(),
    n_batch=512,
    n_gpu_layers=0,
    verbose=False
)

# One-token warmup so the first real request does not pay the cold-start cost
llm("warmup", max_tokens=1)

# ----------------------------
# System Prompt
# ----------------------------
SYSTEM_PROMPT = """
You are an advanced AI assistant. Answer questions clearly and concisely.
You can handle multi-turn conversations and provide detailed responses if needed.
"""

# ----------------------------
# Prompt Builder
# ----------------------------
def build_prompt(message, history):
    """Assemble a ChatML-style prompt from the system prompt, prior turns, and the new message."""
    prompt = f"<|im_start|>system\n{SYSTEM_PROMPT}<|im_end|>\n"
    for h in history:
        # Messages-style history: dicts with "role" and "content" keys
        if isinstance(h, dict) and "role" in h and "content" in h:
            role = "user" if h["role"] == "user" else "assistant"
            prompt += f"<|im_start|>{role}\n{h['content']}<|im_end|>\n"
        # Tuple-style history: (user_message, assistant_message) pairs
        elif isinstance(h, (list, tuple)) and len(h) >= 2:
            u, a = h[0], h[1]
            prompt += f"<|im_start|>user\n{u}<|im_end|>\n<|im_start|>assistant\n{a}<|im_end|>\n"
    prompt += f"<|im_start|>user\n{message}<|im_end|>\n<|im_start|>assistant\n"
    return prompt

# ----------------------------
# Chat Function
# ----------------------------
def generate_response(message, history):
    yield "🤖 Thinking..."
    time.sleep(0.5)

    prompt = build_prompt(message, history)

    output = ""
    for token in llm(prompt, max_tokens=2048, temperature=0.2, top_p=0.9,
                     repeat_penalty=1.1, stream=True):
        output += token["choices"][0]["text"]
        yield output

# ----------------------------
# FastAPI API
# ----------------------------
app = FastAPI()

class ChatRequest(BaseModel):
    message: str
    history: list = []

# Keep the FastAPI endpoint simple so a Web Search client can connect to it
@app.post("/generate_response")  # Path chosen to match the client
def chat_endpoint(request: ChatRequest):
    # Same prompt logic as the chat function, via the shared builder
    prompt = build_prompt(request.message, request.history)
    # Keep streaming off for the API response so the full text arrives in one piece
    res = llm(prompt, max_tokens=1024, temperature=0.3)
    return res["choices"][0]["text"]

# ----------------------------
# Gradio UI
# ----------------------------
# Rounded corners for the main container and chat bubbles
CSS = """
.gradio-container {
    border-radius: 25px !important;
    max-width: 600px !important;
    margin: auto !important;
    overflow: hidden;
}
.message.user { border-radius: 18px 18px 4px 18px !important; }
.message.bot { border-radius: 18px 18px 18px 4px !important; }
"""

with gr.Blocks(theme=gr.Theme.from_hub("JackismyShephard/ultimate-rvc-theme"), css=CSS) as demo:
    gr.HTML("<h1 style='text-align: center;'>Code Explainer AI</h1>")
    chatbot = gr.ChatInterface(
        fn=generate_response,
        chatbot=gr.Chatbot(height=600),
        textbox=gr.Textbox(placeholder="Paste code or ask for explanation...", container=False)
    )

# ----------------------------
# Run Gradio + FastAPI together
# ----------------------------
def run_gradio():
    # Serve the Gradio UI on port 7860 in a background thread
    demo.launch(server_name="0.0.0.0", server_port=7860)

thread = Thread(target=run_gradio, daemon=True)
thread.start()

if __name__ == "__main__":
    # Serve the FastAPI API on port 8000 in the main thread
    uvicorn.run(app, host="0.0.0.0", port=8000)