File size: 4,527 Bytes
a134bd7
 
 
54f6619
a134bd7
 
 
 
fe302c5
a134bd7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fe302c5
54f6619
a134bd7
 
54f6619
a134bd7
 
 
 
 
 
 
54f6619
a134bd7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54f6619
a134bd7
 
 
 
fe302c5
 
54f6619
fe302c5
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
from fastapi import FastAPI
from fastapi.responses import StreamingResponse, JSONResponse
from pydantic import BaseModel
from typing import List, Optional
import json
import uvicorn

# Download the GGUF model file from the Hugging Face Hub; the returned value
# is handed to Llama() below as the path of the weights to load.
model_path = hf_hub_download(
    repo_id="RavikxxBGamin/MinecraftAI-Chat",
    filename="llama-3.2-3b-instruct.Q4_K_M.gguf",
)

# One module-level model instance, shared by the Gradio chat handler and the
# FastAPI chat-completion endpoint defined later in this file.
llm = Llama(
    model_path=model_path,
    n_ctx=4096,  # context size passed to llama.cpp
    n_threads=4,
    verbose=False,
)

# System message placed first in every conversation sent to the model, both
# from the UI handler and from the HTTP API.
SYSTEM_PROMPT = (
    "You are Veil, a helpful assistant made by Axion Labs, specialized in Minecraft "
    "plugin development, Paper/Spigot API, and Java programming."
)


class Message(BaseModel):
    """A single chat message supplied by an API client."""

    # Speaker role; forwarded unchanged to the model by the chat endpoint.
    role: str
    # Text of the message.
    content: str


class ChatRequest(BaseModel):
    """Request body accepted by the chat-completion endpoints.

    Only ``messages`` is required; every other field has a default.
    """

    # Accepted for client compatibility; the handlers in this file do not read it.
    model: Optional[str] = "veil"
    # Conversation so far, in order; the server prepends its own system prompt.
    messages: List[Message]
    # Forwarded to llm.create_chat_completion as max_tokens.
    max_tokens: Optional[int] = 512
    # Forwarded to llm.create_chat_completion as temperature.
    temperature: Optional[float] = 0.7
    # When truthy the endpoint answers with a text/event-stream response.
    stream: Optional[bool] = False


# Gradio UI
def respond(message, history):
    """Stream the assistant's reply for the Gradio chat interface.

    Args:
        message: The text the user just submitted.
        history: Earlier turns of the conversation. Each entry may be either a
            two-item list/tuple ``(user_text, assistant_text)`` or a dict that
            carries ``"role"`` and ``"content"`` keys; entries in any other
            shape are ignored.

    Yields:
        The reply accumulated so far, once per non-empty piece of text
        received from the model.
    """
    conversation = [{"role": "system", "content": SYSTEM_PROMPT}]

    # Best effort: a failure while reading the history is reported and the
    # turns collected up to that point are still used.
    try:
        for entry in history or ():
            if isinstance(entry, (list, tuple)) and len(entry) == 2:
                question, answer = entry
                for role, text in (("user", question), ("assistant", answer)):
                    if text:
                        conversation.append({"role": role, "content": str(text)})
            elif isinstance(entry, dict) and "role" in entry and "content" in entry:
                conversation.append(
                    {"role": entry["role"], "content": str(entry["content"])}
                )
    except Exception as e:
        print("History parsing error:", e)

    conversation.append({"role": "user", "content": message})

    token_stream = llm.create_chat_completion(
        messages=conversation,
        max_tokens=512,
        temperature=0.7,
        top_p=0.95,
        stream=True,
    )

    reply_so_far = ""
    for piece in token_stream:
        text = piece["choices"][0]["delta"].get("content", "")
        if not text:
            # Chunks without text carry nothing to display.
            continue
        reply_so_far += text
        yield reply_so_far


# Chat UI: `respond` is the handler invoked for each submitted message, and
# the strings under `examples` are offered as sample prompts.
demo = gr.ChatInterface(
    fn=respond,
    title="Veil v1.0.2 (Made by Axion Labs)",
    description="Fine-tuned AI for Minecraft plugin dev & Java programming.",
    examples=[
        "Write a Paper plugin that sends a welcome message when a player joins",
        "Difference between runTask and runTaskAsynchronously in Paper?",
        "How do I register a command in Paper?",
        "How does redstone work in Minecraft?",
    ],
    cache_examples=False,
    chatbot=gr.Chatbot(height=400, show_label=False),
)

# Standalone FastAPI app that carries the OpenAI-style HTTP endpoints; the
# Gradio UI is mounted onto it near the end of this file.
fastapi_app = FastAPI()


@fastapi_app.get("/v1/models")
def list_models():
    """Return the model listing: a single entry with id ``"veil"``."""
    veil_entry = {"id": "veil", "object": "model"}
    listing = {"object": "list", "data": [veil_entry]}
    return JSONResponse(listing)


@fastapi_app.post("/v1/chat/completions")
def chat_completions(req: ChatRequest):
    """Run a chat completion for an API client.

    The server's system prompt is always placed first, followed by the
    client's messages in the order given.

    Args:
        req: Parsed request body. ``max_tokens`` and ``temperature`` are
            forwarded to the model; ``stream`` selects the response mode.

    Returns:
        When ``req.stream`` is truthy, a ``text/event-stream`` response in
        which every event is ``data: <json>`` carrying one choice with a
        ``delta`` and a ``finish_reason``, terminated by ``data: [DONE]``.
        Otherwise a JSON response holding the completion dict produced by the
        model.
    """
    messages = [{"role": "system", "content": SYSTEM_PROMPT}]
    messages.extend({"role": m.role, "content": m.content} for m in req.messages)

    # The field is Optional, so an explicit JSON null passes validation and
    # arrives here as None. The sampler expects a number, so fall back to the
    # same default ChatRequest declares instead of forwarding None.
    temperature = 0.7 if req.temperature is None else req.temperature

    if req.stream:
        def generate():
            stream = llm.create_chat_completion(
                messages=messages,
                max_tokens=req.max_tokens,
                temperature=temperature,
                stream=True,
            )
            for chunk in stream:
                choice = chunk["choices"][0]
                text = (choice.get("delta") or {}).get("content") or ""
                finish_reason = choice.get("finish_reason")
                # Skip chunks that carry neither text nor a finish reason,
                # but keep the terminal chunk: it has no text, yet it is the
                # only place the client learns why generation stopped.
                if not text and finish_reason is None:
                    continue
                data = {
                    "choices": [
                        {
                            "delta": {"content": text} if text else {},
                            "finish_reason": finish_reason,
                        }
                    ]
                }
                yield f"data: {json.dumps(data)}\n\n"
            yield "data: [DONE]\n\n"

        return StreamingResponse(generate(), media_type="text/event-stream")

    response = llm.create_chat_completion(
        messages=messages,
        max_tokens=req.max_tokens,
        temperature=temperature,
        stream=False,
    )
    return JSONResponse(response)


@fastapi_app.post("/chat/completions")
def chat_completions_no_prefix(req: ChatRequest):
    """Serve the chat-completion endpoint at a path without the ``/v1`` prefix.

    Delegates to ``chat_completions`` and returns its response unchanged.
    """
    return chat_completions(req)


# Mount Gradio onto the FastAPI app
# root_path="" ensures static assets are served correctly on HuggingFace Spaces
app = gr.mount_gradio_app(fastapi_app, demo, path="/", root_path="")

# Start the server (runs both Gradio UI and FastAPI endpoints), but only when
# this file is executed as a script. Without the guard, merely importing the
# module (for example `uvicorn <module>:app`, or a test) would start a second,
# blocking server from inside the import.
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860)