Spaces:

hugh007
/

openwolf-text

Sleeping

App Files Community

hugh007 committed 21 days ago

Commit

0677a3d

verified ·

1 Parent(s): 0bf1c9b

fix: use hf_hub_download at runtime like OpenWolf-Agent

Browse files

Files changed (2) hide show

Dockerfile +0 -7
app.py +22 -11

Dockerfile CHANGED Viewed

@@ -6,13 +6,6 @@ WORKDIR /app
 RUN pip install --no-cache-dir --timeout 300 llama-cpp-python==0.3.23 \
     --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
-# 下载 GGUF 模型（构建时打包进镜像）
-RUN apt-get update && apt-get install -y --no-install-recommends curl \
-    && rm -rf /var/lib/apt/lists/*
-RUN mkdir -p /app/models && \
-    curl -sL -o /app/models/minicpm3-4b-q4_k_m.gguf \
-    "https://huggingface.co/openbmb/MiniCPM3-4B-GGUF/resolve/main/minicpm3-4b-q4_k_m.gguf"
 COPY requirements.txt .
 RUN pip install -r requirements.txt --no-cache-dir

 RUN pip install --no-cache-dir --timeout 300 llama-cpp-python==0.3.23 \
     --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
 COPY requirements.txt .
 RUN pip install -r requirements.txt --no-cache-dir

app.py CHANGED Viewed

@@ -1,8 +1,9 @@
 """
-OpenWolf 文本 Space — llama-cpp-python（GGUF / MiniCPM-2B）
-模型已内置在 Docker 镜像中
 """
 import os, time, threading, uuid
 from fastapi import FastAPI, Request
 from fastapi.responses import JSONResponse
@@ -12,7 +13,9 @@ _ready = False
 _llm = None
 _llm_lock = threading.Lock()
 _tasks = {}
-MODEL_PATH = "/app/models/minicpm3-4b-q4_k_m.gguf"
 @app.on_event("startup")
@@ -22,14 +25,25 @@ async def startup():
 def _load_model():
     global _llm, _ready
-    if not os.path.exists(MODEL_PATH):
-        print(f"[models] 模型文件不存在: {MODEL_PATH}")
-        return
     print("[models] 加载 GGUF 模型...")
     t0 = time.time()
     from llama_cpp import Llama
     try:
-        _llm = Llama(model_path=MODEL_PATH, n_ctx=1024, n_threads=2, n_gpu_layers=0, verbose=False, use_mmap=False)
         _ready = True
         print(f"[models] 加载完成 ({time.time()-t0:.1f}s)")
     except Exception as e:
@@ -43,7 +57,6 @@ async def health():
 @app.post("/v1/chat/completions")
 async def chat_completions(request: Request):
-    global _llm
     if not _ready or _llm is None:
         return JSONResponse({"error": "模型加载中"}, status_code=503)
     body = await request.json()
@@ -53,8 +66,7 @@ async def chat_completions(request: Request):
     prompt = messages[-1]["content"] if messages else ""
     with _llm_lock:
         out = _llm.create_completion(prompt, max_tokens=max_tokens, temperature=temperature)
-    content = out["choices"][0]["text"].strip()
-    return {"choices": [{"message": {"content": content}}]}
 @app.post("/task/start")
@@ -94,7 +106,6 @@ async def analyze_check(task_id: str):
 def _do_task(task_id, body):
-    global _llm
     text = body.get("task", body.get("text", body.get("question", "")))
     if _llm is None:
         _tasks[task_id] = {"status": "error", "result": "模型未就绪"}

 """
+OpenWolf 文本 Space — llama-cpp-python（GGUF / MiniCPM）
+模型运行时下载（和 OpenWolf-Agent 一样的方式）
 """
 import os, time, threading, uuid
+from pathlib import Path
 from fastapi import FastAPI, Request
 from fastapi.responses import JSONResponse
 _llm = None
 _llm_lock = threading.Lock()
 _tasks = {}
+MODEL_REPO = "openbmb/MiniCPM3-4B-GGUF"
+MODEL_FILE = "minicpm3-4b-q4_k_m.gguf"
+MODEL_DIR = Path("/app/models")
 @app.on_event("startup")
 def _load_model():
     global _llm, _ready
+    MODEL_DIR.mkdir(parents=True, exist_ok=True)
+    model_path = MODEL_DIR / MODEL_FILE
+    if not model_path.exists():
+        print(f"[models] 下载 {MODEL_REPO}/{MODEL_FILE}...")
+        from huggingface_hub import hf_hub_download
+        t0 = time.time()
+        try:
+            hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE, local_dir=str(MODEL_DIR))
+            print(f"[models] 下载完成 ({time.time()-t0:.1f}s)")
+        except Exception as e:
+            print(f"[models] 下载失败: {e}")
+            return
     print("[models] 加载 GGUF 模型...")
     t0 = time.time()
     from llama_cpp import Llama
     try:
+        _llm = Llama(model_path=str(model_path), n_ctx=1024, n_threads=2, n_gpu_layers=0, verbose=False)
         _ready = True
         print(f"[models] 加载完成 ({time.time()-t0:.1f}s)")
     except Exception as e:
 @app.post("/v1/chat/completions")
 async def chat_completions(request: Request):
     if not _ready or _llm is None:
         return JSONResponse({"error": "模型加载中"}, status_code=503)
     body = await request.json()
     prompt = messages[-1]["content"] if messages else ""
     with _llm_lock:
         out = _llm.create_completion(prompt, max_tokens=max_tokens, temperature=temperature)
+    return {"choices": [{"message": {"content": out["choices"][0]["text"].strip()}}]}
 @app.post("/task/start")
 def _do_task(task_id, body):
     text = body.get("task", body.get("text", body.get("question", "")))
     if _llm is None:
         _tasks[task_id] = {"status": "error", "result": "模型未就绪"}