Spaces:

hugh007
/

openwolf-text

Sleeping

App Files Files Community

hugh007 committed 21 days ago

Commit

0745c6f

verified ·

1 Parent(s): 0677a3d

feat: switch to MiniCPM-V-4.6-Thinking GGUF

Browse files

Files changed (2) hide show

Dockerfile +9 -0
app.py +17 -28

Dockerfile CHANGED Viewed

@@ -6,6 +6,15 @@ WORKDIR /app
 RUN pip install --no-cache-dir --timeout 300 llama-cpp-python==0.3.23 \
     --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
 COPY requirements.txt .
 RUN pip install -r requirements.txt --no-cache-dir

 RUN pip install --no-cache-dir --timeout 300 llama-cpp-python==0.3.23 \
     --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
+# 下载 GGUF 模型（构建时打包）
+RUN apt-get update && apt-get install -y --no-install-recommends curl \
+    && rm -rf /var/lib/apt/lists/*
+RUN mkdir -p /app/models && \
+    curl -sL -o /app/models/MiniCPM-V-4_6-Thinking-Q4_K_M.gguf \
+    "https://huggingface.co/openbmb/MiniCPM-V-4.6-Thinking-gguf/resolve/main/MiniCPM-V-4_6-Thinking-Q4_K_M.gguf" && \
+    curl -sL -o /app/models/mmproj-model-f16.gguf \
+    "https://huggingface.co/openbmb/MiniCPM-V-4.6-Thinking-gguf/resolve/main/mmproj-model-f16.gguf"
 COPY requirements.txt .
 RUN pip install -r requirements.txt --no-cache-dir

app.py CHANGED Viewed

@@ -1,9 +1,7 @@
 """
-OpenWolf 文本 Space — llama-cpp-python（GGUF / MiniCPM）
-模型运行时下载（和 OpenWolf-Agent 一样的方式）
 """
 import os, time, threading, uuid
-from pathlib import Path
 from fastapi import FastAPI, Request
 from fastapi.responses import JSONResponse
@@ -13,9 +11,8 @@ _ready = False
 _llm = None
 _llm_lock = threading.Lock()
 _tasks = {}
-MODEL_REPO = "openbmb/MiniCPM3-4B-GGUF"
-MODEL_FILE = "minicpm3-4b-q4_k_m.gguf"
-MODEL_DIR = Path("/app/models")
 @app.on_event("startup")
@@ -25,25 +22,15 @@ async def startup():
 def _load_model():
     global _llm, _ready
-    MODEL_DIR.mkdir(parents=True, exist_ok=True)
-    model_path = MODEL_DIR / MODEL_FILE
-    if not model_path.exists():
-        print(f"[models] 下载 {MODEL_REPO}/{MODEL_FILE}...")
-        from huggingface_hub import hf_hub_download
-        t0 = time.time()
-        try:
-            hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE, local_dir=str(MODEL_DIR))
-            print(f"[models] 下载完成 ({time.time()-t0:.1f}s)")
-        except Exception as e:
-            print(f"[models] 下载失败: {e}")
-            return
-    print("[models] 加载 GGUF 模型...")
     t0 = time.time()
     from llama_cpp import Llama
     try:
-        _llm = Llama(model_path=str(model_path), n_ctx=1024, n_threads=2, n_gpu_layers=0, verbose=False)
         _ready = True
         print(f"[models] 加载完成 ({time.time()-t0:.1f}s)")
     except Exception as e:
@@ -62,11 +49,10 @@ async def chat_completions(request: Request):
     body = await request.json()
     messages = body.get("messages", [])
     max_tokens = int(body.get("max_tokens", 512))
-    temperature = float(body.get("temperature", 0.3))
-    prompt = messages[-1]["content"] if messages else ""
     with _llm_lock:
-        out = _llm.create_completion(prompt, max_tokens=max_tokens, temperature=temperature)
-    return {"choices": [{"message": {"content": out["choices"][0]["text"].strip()}}]}
 @app.post("/task/start")
@@ -112,7 +98,10 @@ def _do_task(task_id, body):
         return
     try:
         with _llm_lock:
-            out = _llm.create_completion(text, max_tokens=2048, temperature=0.3)
-        _tasks[task_id] = {"status": "done", "result": out["choices"][0]["text"]}
     except Exception as e:
         _tasks[task_id] = {"status": "error", "result": str(e)}

 """
+OpenWolf 文本 Space — llama-cpp-python（GGUF / MiniCPM-V-4.6-Thinking）
 """
 import os, time, threading, uuid
 from fastapi import FastAPI, Request
 from fastapi.responses import JSONResponse
 _llm = None
 _llm_lock = threading.Lock()
 _tasks = {}
+MODEL_PATH = "/app/models/MiniCPM-V-4_6-Thinking-Q4_K_M.gguf"
+MMPROJ_PATH = "/app/models/mmproj-model-f16.gguf"
 @app.on_event("startup")
 def _load_model():
     global _llm, _ready
+    if not os.path.exists(MODEL_PATH):
+        print(f"[models] 模型文件不存在: {MODEL_PATH}")
+        return
+    print("[models] 加载模型...")
     t0 = time.time()
     from llama_cpp import Llama
     try:
+        _llm = Llama(model_path=MODEL_PATH, mmproj=MMPROJ_PATH,
+                     n_ctx=2048, n_threads=2, n_gpu_layers=0, verbose=False)
         _ready = True
         print(f"[models] 加载完成 ({time.time()-t0:.1f}s)")
     except Exception as e:
     body = await request.json()
     messages = body.get("messages", [])
     max_tokens = int(body.get("max_tokens", 512))
+    temp = float(body.get("temperature", 0.3))
     with _llm_lock:
+        out = _llm.create_chat_completion(messages=messages, max_tokens=max_tokens, temperature=temp)
+    return {"choices": [{"message": {"content": out["choices"][0]["message"]["content"].strip()}}]}
 @app.post("/task/start")
         return
     try:
         with _llm_lock:
+            out = _llm.create_chat_completion(
+                messages=[{"role": "user", "content": text}],
+                max_tokens=2048, temperature=0.3,
+            )
+        _tasks[task_id] = {"status": "done", "result": out["choices"][0]["message"]["content"]}
     except Exception as e:
         _tasks[task_id] = {"status": "error", "result": str(e)}