import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
from fastapi import FastAPI
from fastapi.responses import StreamingResponse, JSONResponse
from pydantic import BaseModel
from typing import List, Optional
import json
import uvicorn
# Fetch the GGUF model file from the Hugging Face Hub; the returned value is
# the path handed to Llama below.
model_path = hf_hub_download(
    filename="llama-3.2-3b-instruct.Q4_K_M.gguf",
    repo_id="RavikxxBGamin/MinecraftAI-Chat",
)

# Single shared Llama instance used by both the Gradio UI and the HTTP API.
llm = Llama(
    model_path=model_path,
    verbose=False,
    n_threads=4,
    n_ctx=4096,
)
# System message placed at the start of every conversation sent to the model.
SYSTEM_PROMPT = (
    "You are Veil, a helpful assistant made by Axion Labs, "
    "specialized in Minecraft plugin development, "
    "Paper/Spigot API, and Java programming."
)
class Message(BaseModel):
    # One chat turn as received in the request body of the completions API.

    # Speaker of the turn; forwarded unchanged as the "role" of the model message.
    role: str
    # Text of the turn; forwarded unchanged as the "content" of the model message.
    content: str
class ChatRequest(BaseModel):
    # Request body accepted by the chat-completions endpoints.

    # Model identifier supplied by the client; defaults to "veil".
    model: Optional[str] = "veil"
    # Conversation turns, in order; the only field without a default.
    messages: List[Message]
    # Generation limit forwarded to the model call.
    max_tokens: Optional[int] = 512
    # Sampling temperature forwarded to the model call.
    temperature: Optional[float] = 0.7
    # When truthy, the reply is sent as a text/event-stream instead of one JSON body.
    stream: Optional[bool] = False
# Gradio UI
def respond(message, history):
    """Stream the assistant's reply for the Gradio chat UI.

    Builds the conversation from SYSTEM_PROMPT, the prior ``history`` and the
    new ``message``, then yields the reply text accumulated so far each time
    the model produces a non-empty content delta.

    ``history`` entries may be two-item lists/tuples of
    ``(user_msg, assistant_msg)`` or dicts carrying "role" and "content";
    anything else is ignored. History parsing is best-effort: if it raises,
    the error is printed and the entries not yet processed are dropped.
    """
    conversation = [{"role": "system", "content": SYSTEM_PROMPT}]

    try:
        for entry in history or ():
            if isinstance(entry, dict):
                # Message-dict format: keep only entries that carry both keys.
                if "role" in entry and "content" in entry:
                    conversation.append(
                        {"role": entry["role"], "content": str(entry["content"])}
                    )
                continue

            if not (isinstance(entry, (list, tuple)) and len(entry) == 2):
                continue

            # Pair format: either side may be empty and is then skipped.
            user_text, assistant_text = entry
            if user_text:
                conversation.append({"role": "user", "content": str(user_text)})
            if assistant_text:
                conversation.append(
                    {"role": "assistant", "content": str(assistant_text)}
                )
    except Exception as e:
        print("History parsing error:", e)

    conversation.append({"role": "user", "content": message})

    chunks = llm.create_chat_completion(
        messages=conversation,
        max_tokens=512,
        temperature=0.7,
        top_p=0.95,
        stream=True,
    )

    reply_so_far = ""
    for chunk in chunks:
        piece = chunk["choices"][0]["delta"].get("content", "")
        if not piece:
            # Chunks without text (missing or empty "content") produce no update.
            continue
        reply_so_far += piece
        yield reply_so_far
# Chat UI backed by the streaming `respond` generator.
demo = gr.ChatInterface(
    fn=respond,
    chatbot=gr.Chatbot(height=400, show_label=False),
    title="Veil v1.0.2 (Made by Axion Labs)",
    description="Fine-tuned AI for Minecraft plugin dev & Java programming.",
    # Example prompts offered in the UI; caching of their outputs is disabled.
    examples=[
        "Write a Paper plugin that sends a welcome message when a player joins",
        "Difference between runTask and runTaskAsynchronously in Paper?",
        "How do I register a command in Paper?",
        "How does redstone work in Minecraft?",
    ],
    cache_examples=False,
)
# Create standalone FastAPI app for OpenAI-compatible endpoints
# (/v1/models, /v1/chat/completions and /chat/completions are registered on it).
fastapi_app = FastAPI()
@fastapi_app.get("/v1/models")
def list_models():
    """Return the model listing, which contains the single "veil" entry."""
    veil_entry = {"id": "veil", "object": "model"}
    listing = {"object": "list", "data": [veil_entry]}
    return JSONResponse(listing)
@fastapi_app.post("/v1/chat/completions")
def chat_completions(req: ChatRequest):
    """Handle a chat-completion request.

    SYSTEM_PROMPT is always placed first, followed by the client's messages
    in the order received.

    Args:
        req: Parsed request body (messages, max_tokens, temperature, stream).

    Returns:
        A ``StreamingResponse`` of server-sent events when ``req.stream`` is
        truthy: one ``data: {...}`` event per non-empty content delta,
        terminated by ``data: [DONE]``. Otherwise a ``JSONResponse`` wrapping
        the completion returned by the model call.
    """
    messages = [{"role": "system", "content": SYSTEM_PROMPT}]
    messages.extend({"role": m.role, "content": m.content} for m in req.messages)

    # The field is Optional, so a client may send an explicit JSON null.
    # Fall back to the schema default (0.7) rather than forwarding None to the
    # model call. NOTE(review): llama_cpp is assumed to require a float here.
    temperature = 0.7 if req.temperature is None else req.temperature
    # max_tokens is forwarded as received, including None.
    # NOTE(review): confirm how llama_cpp treats max_tokens=None.
    max_tokens = req.max_tokens

    if not req.stream:
        response = llm.create_chat_completion(
            messages=messages,
            max_tokens=max_tokens,
            temperature=temperature,
            stream=False,
        )
        return JSONResponse(response)

    def generate():
        # The model call happens inside the generator, i.e. when the response
        # body is first iterated.
        stream = llm.create_chat_completion(
            messages=messages,
            max_tokens=max_tokens,
            temperature=temperature,
            stream=True,
        )
        for chunk in stream:
            delta = chunk["choices"][0]["delta"].get("content", "")
            if delta:
                data = {
                    "choices": [{"delta": {"content": delta}, "finish_reason": None}]
                }
                yield f"data: {json.dumps(data)}\n\n"
        yield "data: [DONE]\n\n"

    return StreamingResponse(generate(), media_type="text/event-stream")
@fastapi_app.post("/chat/completions")
def chat_completions_no_prefix(req: ChatRequest):
    """Serve the same handler at a path without the /v1 prefix."""
    result = chat_completions(req)
    return result
# Mount Gradio onto the FastAPI app
# root_path="" ensures static assets are served correctly on HuggingFace Spaces
app = gr.mount_gradio_app(fastapi_app, demo, path="/", root_path="")

# Start the server (runs both Gradio UI and FastAPI endpoints) only when this
# file is executed as a script. Without the guard, merely importing the module
# (for example via `uvicorn <module>:app`) would block on a nested server.
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860)