"""Veil chat server: a Gradio chat UI plus OpenAI-style HTTP endpoints.

Both front ends are served from one FastAPI application and share a single
llama.cpp model that is downloaded from the Hugging Face Hub at import time.
"""

import json
from typing import Iterable, Iterator, List, Optional

import gradio as gr
import uvicorn
from fastapi import FastAPI
from fastapi.responses import JSONResponse, StreamingResponse
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
from pydantic import BaseModel

# Sampling defaults shared by the Gradio handler and the HTTP request model.
DEFAULT_MAX_TOKENS = 512
DEFAULT_TEMPERATURE = 0.7

model_path = hf_hub_download(
    repo_id="RavikxxBGamin/MinecraftAI-Chat",
    filename="llama-3.2-3b-instruct.Q4_K_M.gguf",
)

# NOTE(review): this single Llama instance is shared by the Gradio handler and
# both HTTP endpoints - confirm whether concurrent requests need serializing.
llm = Llama(
    model_path=model_path,
    n_ctx=4096,
    n_threads=4,
    verbose=False,
)

SYSTEM_PROMPT = (
    "You are Veil, a helpful assistant made by Axion Labs, specialized in Minecraft "
    "plugin development, Paper/Spigot API, and Java programming."
)


class Message(BaseModel):
    """One chat turn as sent by an HTTP client."""

    role: str
    content: str


class ChatRequest(BaseModel):
    """Request body accepted by the chat-completions endpoints."""

    model: Optional[str] = "veil"
    messages: List[Message]
    max_tokens: Optional[int] = DEFAULT_MAX_TOKENS
    temperature: Optional[float] = DEFAULT_TEMPERATURE
    stream: Optional[bool] = False


def _iter_content_deltas(stream: Iterable[dict]) -> Iterator[str]:
    """Yield the non-empty ``content`` text of each streamed completion chunk."""
    for chunk in stream:
        # Chunks whose delta has no (or empty/None) content are skipped.
        text = chunk["choices"][0]["delta"].get("content", "")
        if text:
            yield text


def _history_to_messages(history) -> List[dict]:
    """Convert Gradio chat history into role/content message dicts.

    Two item shapes are understood: 2-element ``(user, assistant)`` lists or
    tuples, and dicts carrying both ``"role"`` and ``"content"`` keys. Any
    other item is ignored. Parsing is best-effort: on an unexpected error the
    messages converted so far are returned and the error is printed.
    """
    converted: List[dict] = []
    try:
        for item in history or ():
            if isinstance(item, (list, tuple)) and len(item) == 2:
                user_msg, assistant_msg = item
                if user_msg:
                    converted.append({"role": "user", "content": str(user_msg)})
                if assistant_msg:
                    converted.append(
                        {"role": "assistant", "content": str(assistant_msg)}
                    )
            elif isinstance(item, dict):
                if "role" in item and "content" in item:
                    converted.append(
                        {"role": item["role"], "content": str(item["content"])}
                    )
    except Exception as e:
        # Deliberately broad: a malformed history must not prevent the current
        # message from being answered.
        print("History parsing error:", e)
    return converted


# Gradio UI
def respond(message, history):
    """Stream the assistant's reply to ``message`` for the Gradio chat UI.

    Args:
        message: The user's new message.
        history: Previous turns as supplied by Gradio (see
            ``_history_to_messages`` for the accepted item shapes).

    Yields:
        The reply accumulated so far, once per non-empty streamed delta.
    """
    messages = [{"role": "system", "content": SYSTEM_PROMPT}]
    messages.extend(_history_to_messages(history))
    messages.append({"role": "user", "content": message})

    stream = llm.create_chat_completion(
        messages=messages,
        max_tokens=DEFAULT_MAX_TOKENS,
        temperature=DEFAULT_TEMPERATURE,
        top_p=0.95,
        stream=True,
    )

    partial = ""
    for text in _iter_content_deltas(stream):
        partial += text
        yield partial


demo = gr.ChatInterface(
    fn=respond,
    title="Veil v1.0.2 (Made by Axion Labs)",
    description="Fine-tuned AI for Minecraft plugin dev & Java programming.",
    examples=[
        "Write a Paper plugin that sends a welcome message when a player joins",
        "Difference between runTask and runTaskAsynchronously in Paper?",
        "How do I register a command in Paper?",
        "How does redstone work in Minecraft?",
    ],
    cache_examples=False,
    chatbot=gr.Chatbot(height=400, show_label=False),
)

# Create standalone FastAPI app for OpenAI-compatible endpoints
fastapi_app = FastAPI()


@fastapi_app.get("/v1/models")
def list_models():
    """Return the model listing, which contains the single ``veil`` entry."""
    return JSONResponse({
        "object": "list",
        "data": [{"id": "veil", "object": "model"}]
    })


@fastapi_app.post("/v1/chat/completions")
def chat_completions(req: ChatRequest):
    """Run a chat completion for ``req``, streamed or as one JSON response.

    The system prompt is always prepended to the client's messages. When
    ``req.stream`` is truthy the reply is sent as server-sent events, one
    ``data:`` line per non-empty content delta followed by ``data: [DONE]``;
    otherwise the completion dict returned by the model is sent as JSON.
    """
    messages = [{"role": "system", "content": SYSTEM_PROMPT}]
    messages.extend({"role": m.role, "content": m.content} for m in req.messages)

    # An explicit JSON null passes validation as None; create_chat_completion
    # declares temperature as a plain float, so fall back to the default.
    # max_tokens is forwarded unchanged because the library accepts None there.
    temperature = (
        DEFAULT_TEMPERATURE if req.temperature is None else req.temperature
    )

    if not req.stream:
        response = llm.create_chat_completion(
            messages=messages,
            max_tokens=req.max_tokens,
            temperature=temperature,
            stream=False,
        )
        return JSONResponse(response)

    def generate():
        # The completion is started here so that generation only begins once
        # the response body is actually being iterated.
        stream = llm.create_chat_completion(
            messages=messages,
            max_tokens=req.max_tokens,
            temperature=temperature,
            stream=True,
        )
        for text in _iter_content_deltas(stream):
            data = {
                "choices": [{"delta": {"content": text}, "finish_reason": None}]
            }
            yield f"data: {json.dumps(data)}\n\n"
        yield "data: [DONE]\n\n"

    return StreamingResponse(generate(), media_type="text/event-stream")


@fastapi_app.post("/chat/completions")
def chat_completions_no_prefix(req: ChatRequest):
    """Alias of ``chat_completions`` served without the ``/v1`` prefix."""
    return chat_completions(req)


# Mount Gradio onto the FastAPI app
# root_path="" ensures static assets are served correctly on HuggingFace Spaces
app = gr.mount_gradio_app(fastapi_app, demo, path="/", root_path="")

# Start the server (runs both Gradio UI and FastAPI endpoints)
# Guarded so that importing this module (e.g. `uvicorn app:app`) does not start
# a second, blocking server.
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860)