import gradio as gr
from fastapi import FastAPI, Request
from llama_cpp import Llama
import uvicorn
import threading
# 1. Load the model (quantized to fit the 16 GB RAM limit)
llm = Llama.from_pretrained(
    repo_id="tensorblock/WhiteRabbitNeo-2.5-Qwen-2.5-Coder-7B-GGUF",
    filename="WhiteRabbitNeo-2.5-Qwen-2.5-Coder-7B-Q4_K_M.gguf",
    n_ctx=2048,   # context window in tokens
    n_threads=2,  # CPU threads used for inference
)
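# Illustrative smoke test (commented out, not part of the app): the loaded model
# can be called directly with the same ChatML-style prompt the handlers below use:
#   out = llm("<|im_start|>user\nHello<|im_end|>\n<|im_start|>assistant\n",
#             max_tokens=32, stop=["<|im_end|>"])
#   print(out["choices"][0]["text"])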
# 2. FastAPI setup (minimal OpenAI-style wrapper)
app = FastAPI()

@app.post("/v1/chat/completions")
async def chat_completions(request: Request):
    body = await request.json()
    messages = body.get("messages", [])
    # Only the most recent message is used, wrapped in the ChatML prompt format
    user_content = messages[-1]["content"] if messages else ""
    prompt = f"<|im_start|>user\n{user_content}<|im_end|>\n<|im_start|>assistant\n"
    response = llm(prompt, max_tokens=512, stop=["<|im_end|>"])
    content = response["choices"][0]["text"]
    return {
        "choices": [{"message": {"role": "assistant", "content": content}}],
        "model": "whiterabbitneo",
    }
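# Example request (illustrative) once the server is running; the path and JSON
# payload match the handler above, and the port is set in step 4 below:
#   curl -X POST http://localhost:8000/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -d '{"messages": [{"role": "user", "content": "Hello"}]}'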
# 3. Gradio chat interface (required by HF Spaces)
def gf_chat(msg, history):
    prompt = f"<|im_start|>user\n{msg}<|im_end|>\n<|im_start|>assistant\n"
    return llm(prompt, max_tokens=512, stop=["<|im_end|>"])["choices"][0]["text"]

gui = gr.ChatInterface(fn=gf_chat)
# 4. Launch both servers
if __name__ == "__main__":
    # Run FastAPI in a background daemon thread on port 8000
    threading.Thread(
        target=uvicorn.run,
        kwargs={"app": app, "host": "0.0.0.0", "port": 8000},
        daemon=True,
    ).start()
    # Run Gradio on the standard HF Spaces port in the main thread
    gui.launch(server_port=7860)