openwolf-vision / app.py
hugh007's picture
fix: use pre-compiled llama-cpp-python wheel + model in image
0408500 verified
"""
OpenWolf 视觉 Space — llama-cpp-python(GGUF / MiniCPM-V / 多模态)
模型已内置在 Docker 镜像中
"""
import asyncio
import os
import threading
import time
import uuid

from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse
# ASGI application object; the route decorators below register handlers on it.
app = FastAPI(title="OpenWolf Vision")
# Flipped to True by _load_model() once the Llama instance has been constructed.
_ready = False
# The loaded model object, or None until _load_model() succeeds.
_llm = None
# Held around every create_chat_completion() call so inference runs one at a time.
_llm_lock = threading.Lock()
# In-memory task registry: task_id -> {"status": ..., "result": ...}.
# Entries are never removed by the code in this file.
_tasks = {}
# Model weights file; _load_model() refuses to start if it is missing.
MODEL_PATH = "/app/models/MiniCPM-V-4.6-Q4_K_M.gguf"
# Passed to Llama(...) as `mmproj`; presumably the multimodal projector weights — TODO confirm.
MMPROJ_PATH = "/app/models/mmproj-MiniCPM-V-4.6-Q8_0.gguf"
@app.on_event("startup")
async def startup():
    """Start model loading on a daemon thread so application startup returns immediately."""
    loader = threading.Thread(target=_load_model, daemon=True)
    loader.start()
def _load_model():
    """Load the GGUF model into the module-level ``_llm`` and mark the service ready.

    Intended to run on a background thread. On success ``_llm`` holds the
    ``Llama`` instance and ``_ready`` becomes True. On any failure (missing
    model file, import error, constructor error) the problem is printed and
    both globals are left unchanged, so the service keeps reporting not-ready.
    """
    global _llm, _ready
    if not os.path.exists(MODEL_PATH):
        print(f"[models] 模型文件不存在: {MODEL_PATH}")
        return
    print("[models] 加载多模态模型...")
    # perf_counter is monotonic, so the reported duration cannot be skewed by
    # wall-clock adjustments during a long load.
    started = time.perf_counter()
    try:
        # Imported lazily *inside* the try: a missing or broken native wheel
        # must be reported through the same "load failed" path instead of
        # killing the loader thread with an unlogged traceback.
        from llama_cpp import Llama

        # NOTE(review): upstream llama-cpp-python documents multimodal loading
        # via `chat_handler=` with a `clip_model_path`; confirm the bundled
        # wheel actually honours the `mmproj` keyword rather than ignoring it.
        llm = Llama(
            model_path=MODEL_PATH, mmproj=MMPROJ_PATH,
            n_ctx=4096, n_threads=2, n_gpu_layers=0, verbose=False,
        )
    except Exception as e:
        print(f"[models] 加载失败: {e}")
        return
    # Publish the instance before the flag so readers that see _ready also see _llm.
    _llm = llm
    _ready = True
    print(f"[models] 加载完成 ({time.perf_counter()-started:.1f}s)")
@app.get("/health")
async def health():
    """Report that the process is up and whether the model has finished loading."""
    payload = {"status": "ok", "ready": _ready}
    return payload
def _run_chat_completion(messages, max_tokens, temperature):
    """Run one blocking completion while holding the global inference lock."""
    with _llm_lock:
        return _llm.create_chat_completion(
            messages=messages, max_tokens=max_tokens, temperature=temperature
        )


@app.post("/v1/chat/completions")
async def chat_completions(request: Request):
    """Handle a chat-completion request and return the generated text.

    Reads ``messages``, ``max_tokens`` (default 512) and ``temperature``
    (default 0.3) from the JSON body. A missing or ``null`` value falls back
    to its default.

    Returns:
        ``{"choices": [{"message": {"content": <str>}}]}`` on success;
        a JSON ``{"error": ...}`` response with status 503 while the model is
        not loaded, 400 for a malformed request, or 500 if inference fails.
    """
    if not _ready or _llm is None:
        return JSONResponse({"error": "模型加载中"}, status_code=503)

    try:
        body = await request.json()
    except ValueError:
        # json.JSONDecodeError and UnicodeDecodeError both derive from ValueError.
        return JSONResponse({"error": "invalid JSON body"}, status_code=400)
    if not isinstance(body, dict):
        return JSONResponse({"error": "request body must be a JSON object"}, status_code=400)

    messages = body.get("messages", [])
    raw_max_tokens = body.get("max_tokens")
    raw_temperature = body.get("temperature")
    try:
        max_tokens = 512 if raw_max_tokens is None else int(raw_max_tokens)
        temperature = 0.3 if raw_temperature is None else float(raw_temperature)
    except (TypeError, ValueError):
        return JSONResponse(
            {"error": "max_tokens and temperature must be numeric"}, status_code=400
        )

    try:
        # Inference is CPU-bound and the lock acquire may block; run both off
        # the event loop so /health and /task/check stay responsive meanwhile.
        out = await asyncio.to_thread(
            _run_chat_completion, messages, max_tokens, temperature
        )
    except Exception as e:
        print(f"[chat] inference failed: {e}")
        return JSONResponse({"error": str(e)}, status_code=500)

    # The content field may be absent or None; treat that as empty text.
    content = (out["choices"][0]["message"].get("content") or "").strip()
    return {"choices": [{"message": {"content": content}}]}
@app.post("/task/start")
async def task_start(request: Request):
    """Register a background task and start it on a daemon thread.

    The JSON object body is handed unchanged to ``_do_task``. Malformed JSON
    or a non-object body is rejected with status 400 before any task is
    registered, so no task can be left permanently "running" by a worker
    that cannot read its input.

    Returns:
        ``{"ok": True, "task_id": <12 hex chars>}`` on success, or a JSON
        ``{"ok": False, "error": ...}`` response with status 400.
    """
    try:
        body = await request.json()
    except ValueError:
        # json.JSONDecodeError and UnicodeDecodeError both derive from ValueError.
        return JSONResponse({"ok": False, "error": "invalid JSON body"}, status_code=400)
    if not isinstance(body, dict):
        return JSONResponse(
            {"ok": False, "error": "request body must be a JSON object"}, status_code=400
        )

    task_id = uuid.uuid4().hex[:12]
    # Record the task before starting the worker so an immediate poll finds it.
    _tasks[task_id] = {"status": "running", "result": None}
    threading.Thread(target=_do_task, args=(task_id, body), daemon=True).start()
    return {"ok": True, "task_id": task_id}
@app.get("/task/check/{task_id}")
async def task_check(task_id: str):
    """Return the current state of a task, including its result once it has finished."""
    entry = _tasks.get(task_id)
    if not entry:
        return {"status": "error", "result": "not found"}
    state = entry["status"]
    if state == "running":
        # No result yet, so the response carries the status only.
        return {"status": "running"}
    return {"status": state, "result": entry["result"]}
def _do_task(task_id, body):
    """Run one background completion and store its outcome in ``_tasks``.

    The prompt is taken from the first of the ``task``, ``text`` or
    ``question`` keys present in ``body`` (empty string if none). The task's
    entry is always replaced with a final ``done`` or ``error`` record, so a
    task never stays in the ``running`` state after this function returns.

    Args:
        task_id: Key of the entry in ``_tasks`` to update.
        body: Decoded request body; expected to be a dict.
    """
    if _llm is None:
        _tasks[task_id] = {"status": "error", "result": "模型未就绪"}
        return
    try:
        # Input extraction lives inside the try as well: a body without .get()
        # must end the task with an error rather than crash the thread.
        text = body.get("task", body.get("text", body.get("question", "")))
        with _llm_lock:
            out = _llm.create_chat_completion(
                messages=[{"role": "user", "content": text}],
                max_tokens=2048, temperature=0.3,
            )
        result = out["choices"][0]["message"]["content"]
    except Exception as e:
        _tasks[task_id] = {"status": "error", "result": str(e)}
        return
    _tasks[task_id] = {"status": "done", "result": result}