"""
OpenWolf text Space — llama-cpp-python (GGUF / MiniCPM-V-4.6-Thinking)
"""
import os, time, threading, uuid
from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse
# ASGI application object; every route decorator in this file registers on it.
app = FastAPI(title="OpenWolf Text")
# Flipped to True by _load_model once the Llama instance has been constructed.
_ready = False
# The loaded llama_cpp.Llama instance; stays None until loading succeeds.
_llm = None
# Held around every create_chat_completion call so the request handler and
# the background task threads never enter the model at the same time.
_llm_lock = threading.Lock()
# task_id -> {"status": ..., "result": ...}. Written by the start handlers and
# by _do_task; nothing in this file removes entries.
_tasks = {}
# Location of the GGUF model weights checked and loaded by _load_model.
MODEL_PATH = "/app/models/MiniCPM-V-4_6-Thinking-Q6_K.gguf"
# Passed to Llama as `mmproj` — presumably the multimodal projector weights;
# TODO confirm against the model card.
MMPROJ_PATH = "/app/models/mmproj-model-f16.gguf"
@app.on_event("startup")
async def startup():
    """Kick off model loading on a daemon thread when the app starts.

    The load runs in the background so the server can begin answering
    requests (e.g. /health) before the model is available.
    """
    loader = threading.Thread(target=_load_model, daemon=True)
    loader.start()
def _load_model():
    """Load the GGUF model into the module-level ``_llm`` and mark it ready.

    Intended to run on a background thread. On success ``_llm`` holds the
    Llama instance and ``_ready`` is True. On any failure (missing file,
    llama_cpp import error, constructor error) a message is printed and the
    globals are left untouched, so the service keeps reporting not-ready.
    """
    global _llm, _ready
    if not os.path.exists(MODEL_PATH):
        print(f"[models] 模型文件不存在: {MODEL_PATH}")
        return
    print("[models] 加载模型...")
    # Monotonic clock: the elapsed time must not be skewed by wall-clock jumps.
    t0 = time.monotonic()
    try:
        # Imported inside the try so that a missing or broken llama_cpp
        # install is reported through the same failure message below instead
        # of escaping as an uncaught exception in this thread.
        from llama_cpp import Llama

        # NOTE(review): verify that the installed llama-cpp-python actually
        # consumes the `mmproj` keyword — TODO confirm.
        llm = Llama(
            model_path=MODEL_PATH,
            mmproj=MMPROJ_PATH,
            n_ctx=2048,
            n_threads=2,
            n_gpu_layers=0,
            verbose=False,
        )
    except Exception as e:  # top-level boundary of the loader thread
        print(f"[models] 加载失败: {e}")
        return
    # Publish the instance before raising the ready flag so a reader that
    # sees _ready == True always finds a non-None _llm.
    _llm = llm
    _ready = True
    print(f"[models] 加载完成 ({time.monotonic()-t0:.1f}s)")
@app.get("/health")
async def health():
    """Report liveness plus whether the model has finished loading."""
    report = {"status": "ok"}
    report["ready"] = _ready
    return report
@app.post("/v1/chat/completions")
async def chat_completions(request: Request):
    """Run one chat completion and return it in a minimal OpenAI-like shape.

    Request JSON object fields:
        messages:    list of chat messages, passed through to the model
                     (default: empty list).
        max_tokens:  generation limit (default 512; null means default).
        temperature: sampling temperature (default 0.3; null means default).

    Returns:
        ``{"choices": [{"message": {"content": <str>}}]}`` on success;
        a 503 JSON error while the model is not loaded;
        a 400 JSON error when the body is not a JSON object or the numeric
        fields cannot be converted.
    """
    import asyncio

    if not _ready or _llm is None:
        return JSONResponse({"error": "模型加载中"}, status_code=503)

    try:
        body = await request.json()
    except ValueError:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueErrors.
        return JSONResponse({"error": "invalid JSON body"}, status_code=400)
    if not isinstance(body, dict):
        return JSONResponse(
            {"error": "request body must be a JSON object"}, status_code=400
        )

    messages = body.get("messages", [])
    raw_max_tokens = body.get("max_tokens")
    raw_temperature = body.get("temperature")
    try:
        # An explicit null is treated like an omitted field.
        max_tokens = 512 if raw_max_tokens is None else int(raw_max_tokens)
        temp = 0.3 if raw_temperature is None else float(raw_temperature)
    except (TypeError, ValueError):
        return JSONResponse(
            {"error": "max_tokens and temperature must be numeric"},
            status_code=400,
        )

    def _generate():
        # Runs on a worker thread: both waiting for the lock and the
        # inference itself are blocking and must not run on the event loop.
        with _llm_lock:
            return _llm.create_chat_completion(
                messages=messages, max_tokens=max_tokens, temperature=temp
            )

    loop = asyncio.get_running_loop()
    out = await loop.run_in_executor(None, _generate)

    # The content field may be None; normalise it so .strip() cannot fail.
    content = out["choices"][0]["message"]["content"]
    return {"choices": [{"message": {"content": (content or "").strip()}}]}
@app.post("/task/start")
async def task_start(request: Request):
    """Queue a background generation task and return its id right away.

    The id is the first 12 hex characters of a random UUID4. The task is
    recorded as "running" before the worker thread is started, and the
    parsed request body is handed to ``_do_task`` unchanged.
    """
    payload = await request.json()
    new_id = uuid.uuid4().hex[:12]
    _tasks[new_id] = {"status": "running", "result": None}
    worker = threading.Thread(
        target=_do_task,
        args=(new_id, payload),
        daemon=True,
    )
    worker.start()
    return {"ok": True, "task_id": new_id}
@app.get("/task/check/{task_id}")
async def task_check(task_id: str):
    """Return the current state of a task started via /task/start.

    Unknown ids yield an "error" status with result "not found"; a task
    still in progress yields only its status; otherwise status and result
    are both returned.
    """
    record = _tasks.get(task_id)
    if not record:
        return {"status": "error", "result": "not found"}
    state = record["status"]
    if state == "running":
        return {"status": "running"}
    return {"status": state, "result": record["result"]}
@app.post("/analyze-doc/start")
@app.post("/analyze-text/start")
async def analyze_start(request: Request):
    """Start a background analysis task; served under both analyze routes.

    Stores a "running" entry under a fresh 12-hex-character id, launches
    ``_do_task`` on a daemon thread with the parsed request body, and
    returns the id to the caller.
    """
    request_body = await request.json()
    ticket = uuid.uuid4().hex[:12]
    _tasks[ticket] = {"status": "running", "result": None}
    runner = threading.Thread(
        target=_do_task, args=(ticket, request_body), daemon=True
    )
    runner.start()
    return {"ok": True, "task_id": ticket}
@app.get("/analyze-doc/check/{task_id}")
@app.get("/analyze-text/check/{task_id}")
async def analyze_check(task_id: str):
    """Look up an analysis task by id; served under both analyze routes.

    Responds with "not found" for an unknown id, with the bare status while
    the task is running, and with status plus result once it has finished.
    """
    entry = _tasks.get(task_id)
    if not entry:
        return {"status": "error", "result": "not found"}
    if entry["status"] != "running":
        return {"status": entry["status"], "result": entry["result"]}
    return {"status": "running"}
def _do_task(task_id, body):
    """Run one background generation and record the outcome in ``_tasks``.

    Args:
        task_id: key of the entry in ``_tasks`` to update.
        body: parsed JSON request body. The prompt is taken from the first
            non-empty value among the "task", "text" and "question" keys.

    The entry for ``task_id`` is always replaced with a final state: "done"
    with the generated text, or "error" with a message. Nothing is raised
    to the caller, since this runs on a detached thread.
    """
    try:
        # Everything, including reading the body, sits inside the try: an
        # exception escaping this thread would leave the task "running"
        # forever from the poller's point of view.
        if not isinstance(body, dict):
            raise TypeError("request body must be a JSON object")
        # `or` chaining falls through keys that are present but null/empty,
        # which nested .get() defaults would not.
        text = body.get("task") or body.get("text") or body.get("question") or ""
        if _llm is None:
            _tasks[task_id] = {"status": "error", "result": "模型未就绪"}
            return
        with _llm_lock:
            out = _llm.create_chat_completion(
                messages=[{"role": "user", "content": text}],
                max_tokens=2048,
                temperature=0.3,
            )
        _tasks[task_id] = {
            "status": "done",
            "result": out["choices"][0]["message"]["content"],
        }
    except Exception as e:  # top-level boundary of the worker thread
        _tasks[task_id] = {"status": "error", "result": str(e)}
|