""" OpenWolf 视觉 Space — llama-cpp-python(GGUF / MiniCPM-V / 多模态) 模型已内置在 Docker 镜像中 """ import os, time, threading, uuid from fastapi import FastAPI, Request from fastapi.responses import JSONResponse app = FastAPI(title="OpenWolf Vision") _ready = False _llm = None _llm_lock = threading.Lock() _tasks = {} MODEL_PATH = "/app/models/MiniCPM-V-4.6-Q4_K_M.gguf" MMPROJ_PATH = "/app/models/mmproj-MiniCPM-V-4.6-Q8_0.gguf" @app.on_event("startup") async def startup(): threading.Thread(target=_load_model, daemon=True).start() def _load_model(): global _llm, _ready if not os.path.exists(MODEL_PATH): print(f"[models] 模型文件不存在: {MODEL_PATH}") return print("[models] 加载多模态模型...") t0 = time.time() from llama_cpp import Llama try: _llm = Llama( model_path=MODEL_PATH, mmproj=MMPROJ_PATH, n_ctx=4096, n_threads=2, n_gpu_layers=0, verbose=False, ) _ready = True print(f"[models] 加载完成 ({time.time()-t0:.1f}s)") except Exception as e: print(f"[models] 加载失败: {e}") @app.get("/health") async def health(): return {"status": "ok", "ready": _ready} @app.post("/v1/chat/completions") async def chat_completions(request: Request): if not _ready or _llm is None: return JSONResponse({"error": "模型加载中"}, status_code=503) body = await request.json() messages = body.get("messages", []) max_tokens = int(body.get("max_tokens", 512)) temperature = float(body.get("temperature", 0.3)) with _llm_lock: out = _llm.create_chat_completion(messages=messages, max_tokens=max_tokens, temperature=temperature) content = out["choices"][0]["message"]["content"].strip() return {"choices": [{"message": {"content": content}}]} @app.post("/task/start") async def task_start(request: Request): body = await request.json() task_id = uuid.uuid4().hex[:12] _tasks[task_id] = {"status": "running", "result": None} threading.Thread(target=_do_task, args=(task_id, body), daemon=True).start() return {"ok": True, "task_id": task_id} @app.get("/task/check/{task_id}") async def task_check(task_id: str): t = _tasks.get(task_id) if not t: return {"status": "error", "result": "not found"} if t["status"] == "running": return {"status": "running"} return {"status": t["status"], "result": t["result"]} def _do_task(task_id, body): text = body.get("task", body.get("text", body.get("question", ""))) if _llm is None: _tasks[task_id] = {"status": "error", "result": "模型未就绪"} return try: with _llm_lock: out = _llm.create_chat_completion( messages=[{"role": "user", "content": text}], max_tokens=2048, temperature=0.3, ) _tasks[task_id] = {"status": "done", "result": out["choices"][0]["message"]["content"]} except Exception as e: _tasks[task_id] = {"status": "error", "result": str(e)}