| import gradio as gr |
| from llama_cpp import Llama |
| from huggingface_hub import hf_hub_download |
| from fastapi import FastAPI |
| from fastapi.responses import StreamingResponse, JSONResponse |
| from pydantic import BaseModel |
| from typing import List, Optional |
| import json |
| import uvicorn |
|
|
# Ask the Hugging Face Hub client for a local path to the GGUF weights file.
model_path = hf_hub_download(
    filename="llama-3.2-3b-instruct.Q4_K_M.gguf",
    repo_id="RavikxxBGamin/MinecraftAI-Chat",
)
|
|
# One module-level llama.cpp model instance; the Gradio handler and the REST
# handlers in this file all call into this same object.
llm = Llama(model_path=model_path, verbose=False, n_threads=4, n_ctx=4096)
|
|
# System message placed first in every conversation sent to the model.
SYSTEM_PROMPT = " ".join(
    [
        "You are Veil, a helpful assistant made by Axion Labs, specialized in Minecraft",
        "plugin development, Paper/Spigot API, and Java programming.",
    ]
)
|
|
|
|
# One chat turn inside a request body: who is speaking and what was said.
# (Documented with comments rather than a docstring so the generated schema
# description stays as it was.)
class Message(BaseModel):
    # Speaker label; forwarded unchanged as the "role" of the model message.
    role: str
    # Text of the turn; forwarded unchanged as the "content".
    content: str
|
|
|
|
# Request body accepted by the chat-completion routes. Only `messages` is
# required; every other field carries a default.
class ChatRequest(BaseModel):
    # Accepted from the client but not read by the handlers in this file.
    model: Optional[str] = "veil"
    # Conversation supplied by the client, oldest turn first.
    messages: List[Message]
    # Forwarded to the model call as its `max_tokens` argument.
    max_tokens: Optional[int] = 512
    # Forwarded to the model call as its `temperature` argument.
    temperature: Optional[float] = 0.7
    # Truthy -> reply as a server-sent event stream; falsy -> one JSON body.
    stream: Optional[bool] = False
|
|
|
|
| |
def respond(message, history):
    """Stream the assistant's reply for the Gradio chat UI.

    The conversation sent to the model is: the system prompt, the usable
    entries of ``history``, then ``message`` as the newest user turn.

    Args:
        message: The text the user just submitted.
        history: Prior turns. Each item may be a two-element list/tuple of
            (user, assistant) texts, or a dict carrying both "role" and
            "content". Items of any other shape are ignored.

    Yields:
        The reply text accumulated so far, once per non-empty content delta
        received from the model.
    """
    conversation = [{"role": "system", "content": SYSTEM_PROMPT}]

    # Best effort: a failure while reading history is reported and whatever
    # was collected up to that point is still used.
    try:
        for turn in history or ():
            if isinstance(turn, (list, tuple)) and len(turn) == 2:
                # Pair format: first element is the user's text, second the
                # assistant's; empty/None halves are dropped.
                for speaker, text in zip(("user", "assistant"), turn):
                    if text:
                        conversation.append({"role": speaker, "content": str(text)})
                continue
            if isinstance(turn, dict) and "role" in turn and "content" in turn:
                conversation.append(
                    {"role": turn["role"], "content": str(turn["content"])}
                )
    except Exception as e:
        print("History parsing error:", e)

    conversation.append({"role": "user", "content": message})

    reply_so_far = ""
    for event in llm.create_chat_completion(
        messages=conversation,
        max_tokens=512,
        temperature=0.7,
        top_p=0.95,
        stream=True,
    ):
        piece = event["choices"][0]["delta"].get("content", "")
        if not piece:
            # Chunks without text (missing or empty content) produce no update.
            continue
        reply_so_far += piece
        yield reply_so_far
|
|
|
|
# Chat UI driven by respond(); the example prompts are offered to the user
# with example caching switched off.
demo = gr.ChatInterface(
    fn=respond,
    chatbot=gr.Chatbot(show_label=False, height=400),
    title="Veil v1.0.2 (Made by Axion Labs)",
    description="Fine-tuned AI for Minecraft plugin dev & Java programming.",
    cache_examples=False,
    examples=[
        "Write a Paper plugin that sends a welcome message when a player joins",
        "Difference between runTask and runTaskAsynchronously in Paper?",
        "How do I register a command in Paper?",
        "How does redstone work in Minecraft?",
    ],
)


# FastAPI application on which the JSON routes in this file are registered.
fastapi_app = FastAPI()
|
|
|
|
@fastapi_app.get("/v1/models")
def list_models():
    # Model listing route: advertises exactly one model, with id "veil".
    catalog = [{"id": "veil", "object": "model"}]
    return JSONResponse({"object": "list", "data": catalog})
|
|
|
|
@fastapi_app.post("/v1/chat/completions")
def chat_completions(req: ChatRequest):
    """Run a chat completion, streamed as server-sent events or as one JSON body.

    The module's system prompt is always placed ahead of the client's messages.

    Args:
        req: Parsed request body. ``stream`` selects the response mode.
            ``max_tokens`` is forwarded to the model call as-is;
            ``temperature`` is forwarded too, except that an explicit null
            falls back to the field's default of 0.7.

    Returns:
        A ``text/event-stream`` ``StreamingResponse`` when ``req.stream`` is
        truthy, otherwise a ``JSONResponse`` wrapping the model's completion.
    """
    messages = [{"role": "system", "content": SYSTEM_PROMPT}]
    messages.extend({"role": m.role, "content": m.content} for m in req.messages)

    # The field is Optional, so an explicit JSON null passes validation and
    # would otherwise reach the model call as None; use the declared default.
    temperature = 0.7 if req.temperature is None else req.temperature

    if not req.stream:
        response = llm.create_chat_completion(
            messages=messages,
            max_tokens=req.max_tokens,
            temperature=temperature,
            stream=False,
        )
        return JSONResponse(response)

    def generate():
        stream = llm.create_chat_completion(
            messages=messages,
            max_tokens=req.max_tokens,
            temperature=temperature,
            stream=True,
        )
        for chunk in stream:
            choice = chunk["choices"][0]
            delta = choice["delta"].get("content", "")
            finish_reason = choice.get("finish_reason")
            # Drop chunks that carry neither text nor a finish signal.
            if not delta and finish_reason is None:
                continue
            # Relay the finish reason when the model reports one, so clients
            # can tell why generation ended. "content" is always present (an
            # empty string on a finish-only chunk) so consumers that index
            # delta["content"] directly keep working.
            data = {
                "choices": [
                    {
                        "delta": {"content": delta or ""},
                        "finish_reason": finish_reason,
                    }
                ]
            }
            yield f"data: {json.dumps(data)}\n\n"
        yield "data: [DONE]\n\n"

    return StreamingResponse(generate(), media_type="text/event-stream")
|
|
|
|
@fastapi_app.post("/chat/completions")
def chat_completions_no_prefix(req: ChatRequest):
    # Same operation exposed without the "/v1" prefix; all work is delegated
    # to chat_completions and its response is returned untouched.
    result = chat_completions(req)
    return result
|
|
|
|
| |
| |
# Attach the Gradio UI at "/" on the FastAPI app that already holds the JSON
# routes, so a single server process serves both.
app = gr.mount_gradio_app(fastapi_app, demo, root_path="", path="/")


# Start the server on all interfaces, port 7860.
# NOTE(review): this call sits at module top level, so it also runs when the
# module is imported; consider an `if __name__ == "__main__":` guard if the
# file is ever loaded by another runner.
uvicorn.run(app, port=7860, host="0.0.0.0")
|
|