quan3s committed on
Commit
380bdfa
·
verified ·
1 Parent(s): a60b26b

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +181 -0
app.py ADDED
@@ -0,0 +1,181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Minecraft Bot LLM Backend - FastAPI Server
3
+ OpenAI-compatible API endpoint with Bearer Token authentication
4
+ """
5
+
6
import asyncio
import logging
import os
import threading
import time
import uuid
from contextlib import asynccontextmanager
from typing import List, Optional, Union

import uvicorn
from fastapi import FastAPI, HTTPException, Depends, Request
from fastapi.middleware.cors import CORSMiddleware
from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
from pydantic import BaseModel
18
+
19
+ # ─── Logging ───────────────────────────────────────────────────────────────────
20
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
logger = logging.getLogger(__name__)

# ─── Config ────────────────────────────────────────────────────────────────────
# Every setting is read once from the environment when the module is imported.
# The int() conversions raise ValueError at import time if a variable is set
# to something that is not an integer.
BEARER_TOKEN = os.environ.get("BEARER_TOKEN", "")  # shared secret; empty means "not configured"
MODEL_PATH = os.environ.get("MODEL_PATH", "/app/models/model.gguf")  # GGUF file handed to llama_cpp
MODEL_NAME = os.environ.get("MODEL_NAME", "qwen2.5-coder-7b-instruct")  # name reported to API clients
N_CTX = int(os.environ.get("N_CTX", "4096"))  # passed to Llama(n_ctx=...)
N_THREADS = int(os.environ.get("N_THREADS", "4"))  # passed to Llama(n_threads=...)
N_GPU_LAYERS = int(os.environ.get("N_GPU_LAYERS", "0"))  # 0 = CPU only (HF free tier)
MAX_TOKENS = int(os.environ.get("MAX_TOKENS", "1024"))  # default when a request omits max_tokens

if not BEARER_TOKEN:
    # The em dash in this message was mis-encoded ("β€”") in the original text.
    logger.warning("⚠️ BEARER_TOKEN is not set — all requests will be REJECTED.")

# ─── Global model holder ────────────────────────────────────────────────────────
# Stays None until the startup hook assigns the loaded model.
llm = None
37
+
38
+ # ─── Lifespan: load model once at startup ──────────────────────────────────────
39
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Load the model once at startup and publish it through the ``llm`` global.

    Everything before ``yield`` runs before the application starts serving;
    everything after it runs on shutdown.

    Args:
        app: The FastAPI application this lifespan is attached to (unused).

    Raises:
        RuntimeError: If ``llama_cpp`` cannot be imported or the model cannot
            be constructed. The original exception is attached as the cause.
    """
    global llm
    logger.info("🔄 Loading model from: %s", MODEL_PATH)
    try:
        # Imported lazily so a missing or broken llama_cpp install is reported
        # through the same error path as an unreadable model file.
        from llama_cpp import Llama

        llm = Llama(
            model_path=MODEL_PATH,
            n_ctx=N_CTX,
            n_threads=N_THREADS,
            n_gpu_layers=N_GPU_LAYERS,
            verbose=False,
            chat_format="chatml",  # works for Qwen2.5 / most instruct models
        )
    except Exception as e:
        logger.error("❌ Failed to load model: %s", e)
        # Chain the cause so the real traceback is not lost on startup failure.
        raise RuntimeError(f"Model load failed: {e}") from e
    logger.info("✅ Model loaded successfully.")
    yield
    logger.info("🛑 Shutting down.")
59
+
60
+ # ─── App ────────────────────────────────────────────────────────────────────────
61
# Application object; `lifespan` is the startup/shutdown hook defined above.
app = FastAPI(
    lifespan=lifespan,
    version="1.0.0",
    title="Minecraft Bot LLM API",
    description="OpenAI-compatible LLM backend for Minecraft automation bot",
)

# CORS is fully open: any origin, any method, any request header.
app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_methods=["*"], allow_headers=["*"])
74
+
75
+ # ─── Auth ────────────────────────────────────────────────────────────────────────
76
# auto_error=False makes the scheme return None instead of raising when the
# Authorization header is absent or not a Bearer header, so verify_token()
# answers that case with the same 401 it uses for a wrong token.
security = HTTPBearer(auto_error=False)


def verify_token(credentials: Optional[HTTPAuthorizationCredentials] = Depends(security)):
    """Check the request's Bearer token against ``BEARER_TOKEN``.

    Args:
        credentials: Parsed ``Authorization`` header, or ``None`` when the
            header is missing or does not use the Bearer scheme.

    Returns:
        The token string the client presented.

    Raises:
        HTTPException: 503 when ``BEARER_TOKEN`` is not configured on the
            server; 401 when the token is missing or does not match.
    """
    import secrets

    if not BEARER_TOKEN:
        raise HTTPException(status_code=503, detail="Server misconfigured: BEARER_TOKEN not set.")

    unauthorized = HTTPException(
        status_code=401,
        detail="Invalid or missing Bearer Token.",
        headers={"WWW-Authenticate": "Bearer"},
    )
    if credentials is None:
        raise unauthorized

    # Constant-time comparison so response timing does not leak how much of
    # the token matched. Compare bytes: compare_digest rejects non-ASCII str.
    token_ok = secrets.compare_digest(
        credentials.credentials.encode("utf-8"),
        BEARER_TOKEN.encode("utf-8"),
    )
    if not token_ok:
        raise unauthorized
    return credentials.credentials
84
+
85
+ # ─── Pydantic Schemas (OpenAI-compatible) ──────────────────────────────────────
86
# One chat turn, used both for incoming request messages and for the
# assistant message in the response.
class ChatMessage(BaseModel):
    # Speaker of the turn: "system" | "user" | "assistant" (not enforced here).
    role: str
    # Plain-text body of the turn.
    content: str
89
+
90
# Request body of POST /v1/chat/completions.
class ChatCompletionRequest(BaseModel):
    # Accepted from clients; the handler reports MODEL_NAME regardless.
    model: Optional[str] = None
    # Conversation so far, in order.
    messages: List[ChatMessage]
    # Generation cap; when falsy the handler falls back to MAX_TOKENS.
    max_tokens: Optional[int] = None
    # Sampling parameters forwarded to the model.
    temperature: Optional[float] = 0.2
    top_p: Optional[float] = 0.95
    # Accepted from clients; the handler returns one complete response.
    stream: Optional[bool] = False
    # Stop sequence(s); the handler forwards an empty list when unset.
    stop: Optional[Union[str, List[str]]] = None
98
+
99
# One generated alternative inside a completion response.
class ChatCompletionChoice(BaseModel):
    # Position of this choice in the `choices` list.
    index: int
    # The generated assistant message.
    message: ChatMessage
    # Reason generation ended, as reported by the backend.
    finish_reason: str
103
+
104
# Token accounting copied from the backend's `usage` dict.
class UsageInfo(BaseModel):
    prompt_tokens: int
    completion_tokens: int
    total_tokens: int
108
+
109
# Response body of POST /v1/chat/completions.
class ChatCompletionResponse(BaseModel):
    # Unique id; the handler generates "chatcmpl-<uuid hex>".
    id: str
    # Constant object tag for this response type.
    object: str = "chat.completion"
    # Creation time as integer Unix seconds.
    created: int
    # Name of the model that produced the answer.
    model: str
    # Generated alternatives; the handler returns exactly one.
    choices: List[ChatCompletionChoice]
    # Token counts for the request.
    usage: UsageInfo
116
+
117
+ # ─── Routes ──────────────────────────────────────────────────────────────────────
118
@app.get("/")
async def root():
    # Unauthenticated landing route: a fixed payload naming the service.
    payload = {"status": "ok", "service": "Minecraft Bot LLM Backend"}
    return payload
121
+
122
@app.get("/health")
async def health():
    # Unauthenticated probe; reports whether the module-level `llm` is set.
    model_ready = llm is not None
    return {"status": "healthy", "model_loaded": model_ready}
125
+
126
@app.get("/v1/models", dependencies=[Depends(verify_token)])
async def list_models():
    # Token-protected listing that advertises the single configured model.
    model_entry = {"id": MODEL_NAME, "object": "model", "owned_by": "local"}
    return {"object": "list", "data": [model_entry]}
132
+
133
# Inference runs in a worker thread so the event loop stays responsive.
# The shared model object is not assumed to be thread-safe, so this lock keeps
# generations strictly one at a time, as the in-loop call did implicitly.
_inference_lock = threading.Lock()


@app.post("/v1/chat/completions", response_model=ChatCompletionResponse, dependencies=[Depends(verify_token)])
async def chat_completions(request: ChatCompletionRequest):
    """Run one non-streaming chat completion against the loaded model.

    Args:
        request: Parsed request body. ``max_tokens`` falls back to
            ``MAX_TOKENS`` when falsy; ``stream`` is not implemented and a
            single complete response is returned either way.

    Returns:
        A ``ChatCompletionResponse`` with exactly one choice, a fresh
        ``chatcmpl-<uuid hex>`` id and ``MODEL_NAME`` as the model.

    Raises:
        HTTPException: 503 when the model is not loaded; 500 when the
            backend raises during generation.
    """
    if llm is None:
        raise HTTPException(status_code=503, detail="Model not loaded yet.")

    messages = [{"role": m.role, "content": m.content} for m in request.messages]
    max_tok = request.max_tokens or MAX_TOKENS

    logger.info("📨 Inference request | messages=%s | max_tokens=%s", len(messages), max_tok)
    if request.stream:
        # Make the previously silent fallback visible in the logs.
        logger.warning("stream=true requested but streaming is not implemented; returning a single response.")

    def _generate():
        # Blocking call; executed off the event loop, serialized by the lock.
        with _inference_lock:
            return llm.create_chat_completion(
                messages=messages,
                max_tokens=max_tok,
                temperature=request.temperature,
                top_p=request.top_p,
                stop=request.stop or [],
            )

    try:
        result = await asyncio.get_running_loop().run_in_executor(None, _generate)
    except Exception as e:
        logger.error("Inference error: %s", e)
        raise HTTPException(status_code=500, detail=f"Inference failed: {e}") from e

    choice = result["choices"][0]
    # `or` also covers keys that are present but None, which would otherwise
    # fail validation of the str/int response fields.
    usage = result.get("usage") or {}

    return ChatCompletionResponse(
        id=f"chatcmpl-{uuid.uuid4().hex}",
        created=int(time.time()),
        model=MODEL_NAME,
        choices=[
            ChatCompletionChoice(
                index=0,
                message=ChatMessage(
                    role="assistant",
                    content=choice["message"]["content"] or "",
                ),
                finish_reason=choice.get("finish_reason") or "stop",
            )
        ],
        usage=UsageInfo(
            prompt_tokens=usage.get("prompt_tokens") or 0,
            completion_tokens=usage.get("completion_tokens") or 0,
            total_tokens=usage.get("total_tokens") or 0,
        ),
    )
178
+
179
+ # ─── Entry point ────────────────────────────────────────────────────────────────
180
if __name__ == "__main__":
    # Direct-run entry point: serve on all interfaces, port 7860, one worker.
    uvicorn.run(
        "app:app",
        host="0.0.0.0",
        port=7860,
        workers=1,
    )