"""Serve a quantized WhiteRabbitNeo GGUF model two ways.

* A minimal OpenAI-style ``POST /v1/chat/completions`` endpoint (FastAPI).
* A Gradio chat UI (the interface Hugging Face Spaces expects).

Both front ends share one in-process ``Llama`` instance, so every generation
goes through ``_complete``, which serializes access with a lock.
"""

import threading

import gradio as gr
import uvicorn
from fastapi import FastAPI, HTTPException, Request
from fastapi.concurrency import run_in_threadpool
from llama_cpp import Llama

MODEL_NAME = "whiterabbitneo"
MAX_TOKENS = 512
STOP_SEQUENCES = ["<|im_end|>"]
API_PORT = 8000
UI_PORT = 7860

# Roles rendered verbatim into the ChatML prompt; anything else is treated as
# a user turn, which is how the original handler treated every message.
_CHATML_ROLES = frozenset({"system", "user", "assistant"})

# 1. Load the model (quantized for the 16GB RAM limit).
llm = Llama.from_pretrained(
    repo_id="tensorblock/WhiteRabbitNeo-2.5-Qwen-2.5-Coder-7B-GGUF",
    filename="WhiteRabbitNeo-2.5-Qwen-2.5-Coder-7B-Q4_K_M.gguf",
    n_ctx=2048,
    n_threads=2,
)

# The Gradio worker threads and the FastAPI thread all call the same model
# object, so calls into it are serialized.
_llm_lock = threading.Lock()


def _build_prompt(messages) -> str:
    """Render OpenAI-style chat messages as a ChatML prompt.

    Args:
        messages: Non-empty list of dicts, each with a string ``content`` and
            an optional ``role``.

    Returns:
        The ChatML text for all turns, ending with an open assistant turn.

    Raises:
        ValueError: If ``messages`` is not a non-empty list, or an entry is
            not a dict with a string ``content``.
    """
    if not isinstance(messages, list) or not messages:
        raise ValueError("'messages' must be a non-empty list.")

    turns = []
    for message in messages:
        if not isinstance(message, dict) or not isinstance(
            message.get("content"), str
        ):
            raise ValueError(
                "Each message must be an object with a string 'content'."
            )
        role = message.get("role")
        if role not in _CHATML_ROLES:
            role = "user"
        turns.append(f"<|im_start|>{role}\n{message['content']}<|im_end|>\n")

    turns.append("<|im_start|>assistant\n")
    return "".join(turns)


def _complete(prompt: str) -> dict:
    """Run one blocking completion on the shared model and return its result."""
    with _llm_lock:
        return llm(prompt, max_tokens=MAX_TOKENS, stop=STOP_SEQUENCES)


# 2. FastAPI setup (OpenAI-style wrapper).
app = FastAPI()


@app.post("/v1/chat/completions")
async def chat_completions(request: Request):
    """Answer an OpenAI-style chat completion request.

    Reads ``messages`` from the JSON body, renders the whole conversation as
    a ChatML prompt and returns the model's reply as a single choice.

    Raises:
        HTTPException: 400 when the body is not a JSON object, ``messages``
            is missing/empty/malformed, or the model rejects the prompt.
    """
    try:
        body = await request.json()
    except ValueError as exc:  # json.JSONDecodeError is a ValueError
        raise HTTPException(
            status_code=400, detail="Request body must be valid JSON."
        ) from exc
    if not isinstance(body, dict):
        raise HTTPException(
            status_code=400, detail="Request body must be a JSON object."
        )

    try:
        prompt = _build_prompt(body.get("messages"))
    except ValueError as exc:
        raise HTTPException(status_code=400, detail=str(exc)) from exc

    try:
        # Generation is synchronous and slow; run it off the event loop so
        # other requests are not stalled while the model is busy.
        response = await run_in_threadpool(_complete, prompt)
    except ValueError as exc:
        # NOTE(review): llama-cpp-python appears to raise ValueError when the
        # prompt does not fit in the context window -- confirm for the pinned
        # version.
        raise HTTPException(status_code=400, detail=str(exc)) from exc

    choice = response["choices"][0]
    payload = {
        "object": "chat.completion",
        "model": MODEL_NAME,
        "choices": [
            {
                "index": 0,
                "message": {"role": "assistant", "content": choice["text"]},
                "finish_reason": choice.get("finish_reason"),
            }
        ],
    }
    if "usage" in response:
        payload["usage"] = response["usage"]
    return payload


# 3. Gradio interface (required by HF Spaces).
def gf_chat(msg, history):
    """Return the model's reply to ``msg`` for the Gradio chat UI.

    ``history`` is accepted because ``gr.ChatInterface`` passes it, but it is
    not included in the prompt.
    """
    prompt = _build_prompt([{"role": "user", "content": str(msg)}])
    return _complete(prompt)["choices"][0]["text"]


gui = gr.ChatInterface(fn=gf_chat)


# 4. Launch both.
def main() -> None:
    """Start the API server in the background and the Gradio UI in front."""
    # Daemon thread: the API server must not keep the process alive once the
    # Gradio UI (which owns the main thread) has exited.
    # NOTE(review): HF Spaces may expose only the UI port; if the API has to
    # be reachable there, consider serving both from one port -- confirm.
    api_thread = threading.Thread(
        target=uvicorn.run,
        kwargs={"app": app, "host": "0.0.0.0", "port": API_PORT},
        daemon=True,
    )
    api_thread.start()

    # Run Gradio on the standard port.
    gui.launch(server_port=UI_PORT)


if __name__ == "__main__":
    main()