import threading

import gradio as gr
import uvicorn
from fastapi import FastAPI, HTTPException, Request
from llama_cpp import Llama

# 1. Load the model (Quantized for 16GB RAM limit)
# The loader arguments are collected in one mapping and expanded into the
# call, so the model choice and runtime settings can be read at a glance.
_MODEL_OPTIONS = {
    "repo_id": "tensorblock/WhiteRabbitNeo-2.5-Qwen-2.5-Coder-7B-GGUF",
    "filename": "WhiteRabbitNeo-2.5-Qwen-2.5-Coder-7B-Q4_K_M.gguf",
    "n_ctx": 2048,
    "n_threads": 2,
}
llm = Llama.from_pretrained(**_MODEL_OPTIONS)

# 2. FastAPI Setup (OpenAI Wrapper)
# Application object that the route decorators below register against.
app = FastAPI()

@app.post("/v1/chat/completions")
async def chat_completions(request: Request):
    """Serve a minimal OpenAI-style chat completion from the local model.

    Only the final entry of ``messages`` is used: its ``content`` is wrapped
    in a single ChatML user turn and completed by ``llm``. Earlier messages
    and the ``role`` fields are ignored.

    Args:
        request: Incoming request whose JSON body is an object holding a
            non-empty ``messages`` list; the last entry must be an object
            with a ``content`` field.

    Returns:
        A dict shaped like an OpenAI chat completion, containing one
        assistant message in ``choices`` plus ``model`` and ``object``.

    Raises:
        HTTPException: With status 400 when the body is not a JSON object or
            ``messages`` does not end with a usable entry.
    """
    try:
        body = await request.json()
    except ValueError as exc:  # json.JSONDecodeError is a ValueError subclass
        raise HTTPException(
            status_code=400, detail="Request body must be valid JSON."
        ) from exc
    if not isinstance(body, dict):
        raise HTTPException(
            status_code=400, detail="Request body must be a JSON object."
        )

    # Reject missing, null, or empty `messages` up front; indexing [-1] on
    # them would otherwise surface as an unhandled server error.
    messages = body.get("messages")
    if not isinstance(messages, list) or not messages:
        raise HTTPException(
            status_code=400, detail="'messages' must be a non-empty list."
        )

    last_message = messages[-1]
    if not isinstance(last_message, dict) or "content" not in last_message:
        raise HTTPException(
            status_code=400,
            detail="The last message must be an object with a 'content' field.",
        )

    prompt = (
        f"<|im_start|>user\n{last_message['content']}<|im_end|>\n"
        "<|im_start|>assistant\n"
    )

    # NOTE(review): this is a synchronous call inside an async handler, so
    # the event loop is blocked for the duration of the generation.
    response = llm(prompt, max_tokens=512, stop=["<|im_end|>"])
    choice = response["choices"][0]

    return {
        "object": "chat.completion",
        "model": "whiterabbitneo",
        "choices": [
            {
                "index": 0,
                "message": {"role": "assistant", "content": choice["text"]},
                # Copied from the completion when the model reports one.
                "finish_reason": choice.get("finish_reason"),
            }
        ],
    }

# 3. Gradio Interface (Required by HF Spaces)
def gf_chat(msg, history):
    """Answer a single Gradio chat message with the local model.

    Args:
        msg: Text of the latest user message.
        history: Earlier conversation supplied by the chat interface. It is
            not used, so each reply is generated without prior turns.

    Returns:
        The text of the first completion choice produced by ``llm``.
    """
    prompt = f"<|im_start|>user\n{msg}<|im_end|>\n<|im_start|>assistant\n"
    # Stop at the end-of-turn marker, as the /v1/chat/completions endpoint
    # does, so generation cannot run on past the assistant's reply.
    completion = llm(prompt, max_tokens=512, stop=["<|im_end|>"])
    return completion["choices"][0]["text"]

gui = gr.ChatInterface(fn=gf_chat)

# 4. Launch both
if __name__ == "__main__":
    # Run FastAPI in a background thread. The thread is a daemon so the
    # process can exit once the Gradio server below stops or fails to start;
    # a non-daemon uvicorn thread would keep the interpreter alive.
    # NOTE(review): both servers call the same `llm` object from different
    # threads; confirm llama_cpp tolerates concurrent calls or serialize
    # access with a lock.
    threading.Thread(
        target=uvicorn.run,
        kwargs={"app": app, "host": "0.0.0.0", "port": 8000},
        daemon=True,
    ).start()
    # Run Gradio on the standard port
    gui.launch(server_port=7860)