Maxwell
Fix: run uvicorn directly + use root_path='' to fix Gradio UI on HF Spaces
fe302c5 verified
Raw
History Blame Contribute Delete
4.53 kB
import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
from fastapi import FastAPI
from fastapi.responses import StreamingResponse, JSONResponse
from pydantic import BaseModel
from typing import List, Optional
import json
import uvicorn
# Fetch the GGUF model file from the Hugging Face Hub; hf_hub_download
# returns the path of the local copy.
model_path = hf_hub_download(
    filename="llama-3.2-3b-instruct.Q4_K_M.gguf",
    repo_id="RavikxxBGamin/MinecraftAI-Chat",
)

# Single module-level llama.cpp model instance. Both the Gradio callback and
# the HTTP chat endpoints in this file generate text through it.
llm = Llama(
    model_path=model_path,
    verbose=False,
    n_threads=4,
    n_ctx=4096,
)
# Persona instruction sent as the first ("system") message of every
# conversation built in this file.
SYSTEM_PROMPT = (
    "You are Veil, a helpful assistant made by Axion Labs, "
    "specialized in Minecraft plugin development, "
    "Paper/Spigot API, and Java programming."
)
class Message(BaseModel):
    """One chat turn inside a request body: a speaker role and its text."""

    # Speaker label; forwarded unchanged as the "role" of the model message.
    role: str
    # Text of the turn; forwarded unchanged as the "content".
    content: str
class ChatRequest(BaseModel):
    """Request body accepted by the chat-completion endpoints.

    Only ``messages`` is required. Every other field is declared
    ``Optional`` with a default, so a client may omit it or send an
    explicit ``null``.
    """

    # Model identifier supplied by the client.
    model: Optional[str] = "veil"

    # Conversation turns, in the order they should be shown to the model.
    messages: List[Message]

    # Generation settings handed to the model call.
    max_tokens: Optional[int] = 512
    temperature: Optional[float] = 0.7

    # When true the endpoint answers with a server-sent event stream instead
    # of a single JSON document.
    stream: Optional[bool] = False
# Gradio UI
def respond(message, history):
    """Stream the assistant's reply for the Gradio chat UI.

    The prompt is the system prompt, followed by the earlier turns found in
    ``history``, followed by the new ``message``. This is a generator: each
    time the model produces more text it yields the whole reply accumulated
    so far.

    ``history`` entries may be two-item ``[user, assistant]`` sequences or
    ``{"role": ..., "content": ...}`` dicts; entries of any other shape are
    ignored.
    """
    messages = [{"role": "system", "content": SYSTEM_PROMPT}]

    # Best effort: if walking the history fails, the error is printed and
    # whatever turns were collected before the failure are kept.
    try:
        for turn in history or ():
            if isinstance(turn, (list, tuple)) and len(turn) == 2:
                # Pair format: position 0 is the user text, position 1 the
                # assistant text; empty/None halves are skipped.
                for speaker, text in zip(("user", "assistant"), turn):
                    if text:
                        messages.append({"role": speaker, "content": str(text)})
            elif isinstance(turn, dict) and "role" in turn and "content" in turn:
                messages.append(
                    {"role": turn["role"], "content": str(turn["content"])}
                )
    except Exception as e:
        print("History parsing error:", e)

    messages.append({"role": "user", "content": message})

    token_stream = llm.create_chat_completion(
        messages=messages,
        max_tokens=512,
        temperature=0.7,
        top_p=0.95,
        stream=True,
    )

    reply_so_far = ""
    for event in token_stream:
        piece = event["choices"][0]["delta"].get("content", "")
        if not piece:
            # Chunks without text (missing, empty or None content) carry
            # nothing to display.
            continue
        reply_so_far += piece
        yield reply_so_far
# Gradio chat front end: `respond` is the streaming callback, and the example
# prompts are offered to the user without being pre-computed
# (cache_examples=False).
demo = gr.ChatInterface(
    title="Veil v1.0.2 (Made by Axion Labs)",
    description="Fine-tuned AI for Minecraft plugin dev & Java programming.",
    fn=respond,
    chatbot=gr.Chatbot(height=400, show_label=False),
    cache_examples=False,
    examples=[
        "Write a Paper plugin that sends a welcome message when a player joins",
        "Difference between runTask and runTaskAsynchronously in Paper?",
        "How do I register a command in Paper?",
        "How does redstone work in Minecraft?",
    ],
)
# Standalone FastAPI app that carries the OpenAI-compatible HTTP routes
# (/v1/models, /v1/chat/completions, /chat/completions) registered below.
# The Gradio UI is mounted onto this same app further down in the file.
fastapi_app = FastAPI()
@fastapi_app.get("/v1/models")
def list_models():
    """Return the model listing, which contains the single model ``veil``."""
    veil_entry = {"id": "veil", "object": "model"}
    listing = {"object": "list", "data": [veil_entry]}
    return JSONResponse(listing)
@fastapi_app.post("/v1/chat/completions")
def chat_completions(req: ChatRequest):
    """Handle an OpenAI-style chat completion request.

    The system prompt is always placed first, followed by the request's
    messages in order.

    Args:
        req: Parsed request body. ``stream`` selects the response mode;
            ``max_tokens`` and ``temperature`` are passed to the model.

    Returns:
        A ``StreamingResponse`` of server-sent events when ``req.stream`` is
        truthy (one ``data:`` event per non-empty content delta, terminated
        by ``data: [DONE]``), otherwise a ``JSONResponse`` wrapping the
        model's completion.
    """
    messages = [{"role": "system", "content": SYSTEM_PROMPT}]
    messages.extend({"role": m.role, "content": m.content} for m in req.messages)

    # `temperature` is Optional in the request schema, so a client can send an
    # explicit JSON null. Forwarding None to the sampler is not a valid
    # temperature, so fall back to the schema's default instead.
    temperature = 0.7 if req.temperature is None else req.temperature

    if not req.stream:
        response = llm.create_chat_completion(
            messages=messages,
            max_tokens=req.max_tokens,
            temperature=temperature,
            stream=False,
        )
        return JSONResponse(response)

    def generate():
        stream = llm.create_chat_completion(
            messages=messages,
            max_tokens=req.max_tokens,
            temperature=temperature,
            stream=True,
        )
        for chunk in stream:
            delta = chunk["choices"][0]["delta"].get("content", "")
            if not delta:
                # Only chunks that carry text are forwarded to the client.
                continue
            # Keep whatever envelope fields the backend put on the chunk
            # (id / object / created / model) so clients that expect the
            # standard chunk shape can parse the event, then attach the
            # single content delta.
            data = {key: value for key, value in chunk.items() if key != "choices"}
            data["choices"] = [
                {"index": 0, "delta": {"content": delta}, "finish_reason": None}
            ]
            yield f"data: {json.dumps(data)}\n\n"
        yield "data: [DONE]\n\n"

    return StreamingResponse(generate(), media_type="text/event-stream")
@fastapi_app.post("/chat/completions")
def chat_completions_no_prefix(req: ChatRequest):
    """Serve ``/chat/completions`` (no ``/v1`` prefix) with the same handler."""
    # Delegate so both routes always produce the same response.
    return chat_completions(req)
# Mount the Gradio UI at the root path of the FastAPI app so a single server
# serves both the UI and the API routes.
# root_path="" ensures static assets are served correctly on HuggingFace Spaces
app = gr.mount_gradio_app(fastapi_app, demo, path="/", root_path="")

# Start the server (runs both Gradio UI and FastAPI endpoints) only when this
# file is executed as a script. Without the guard, merely importing the module
# (for example `uvicorn <module>:app`) would block inside uvicorn.run.
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860)