hugh007 committed on
Commit
0677a3d
·
verified ·
1 Parent(s): 0bf1c9b

fix: use hf_hub_download at runtime like OpenWolf-Agent

Browse files
Files changed (2) hide show
  1. Dockerfile +0 -7
  2. app.py +22 -11
Dockerfile CHANGED
@@ -6,13 +6,6 @@ WORKDIR /app
6
  RUN pip install --no-cache-dir --timeout 300 llama-cpp-python==0.3.23 \
7
  --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
8
 
9
- # 下载 GGUF 模型(构建时打包进镜像)
10
- RUN apt-get update && apt-get install -y --no-install-recommends curl \
11
- && rm -rf /var/lib/apt/lists/*
12
- RUN mkdir -p /app/models && \
13
- curl -sL -o /app/models/minicpm3-4b-q4_k_m.gguf \
14
- "https://huggingface.co/openbmb/MiniCPM3-4B-GGUF/resolve/main/minicpm3-4b-q4_k_m.gguf"
15
-
16
  COPY requirements.txt .
17
  RUN pip install -r requirements.txt --no-cache-dir
18
 
 
6
  RUN pip install --no-cache-dir --timeout 300 llama-cpp-python==0.3.23 \
7
  --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
8
 
 
 
 
 
 
 
 
9
  COPY requirements.txt .
10
  RUN pip install -r requirements.txt --no-cache-dir
11
 
app.py CHANGED
@@ -1,8 +1,9 @@
1
  """
2
- OpenWolf 文本 Space — llama-cpp-python(GGUF / MiniCPM-2B
3
- 模型已内置在 Docker 镜像中
4
  """
5
  import os, time, threading, uuid
 
6
  from fastapi import FastAPI, Request
7
  from fastapi.responses import JSONResponse
8
 
@@ -12,7 +13,9 @@ _ready = False
12
  _llm = None
13
  _llm_lock = threading.Lock()
14
  _tasks = {}
15
- MODEL_PATH = "/app/models/minicpm3-4b-q4_k_m.gguf"
 
 
16
 
17
 
18
  @app.on_event("startup")
@@ -22,14 +25,25 @@ async def startup():
22
 
23
  def _load_model():
24
  global _llm, _ready
25
- if not os.path.exists(MODEL_PATH):
26
- print(f"[models] 模型文件不存在: {MODEL_PATH}")
27
- return
 
 
 
 
 
 
 
 
 
 
 
28
  print("[models] 加载 GGUF 模型...")
29
  t0 = time.time()
30
  from llama_cpp import Llama
31
  try:
32
- _llm = Llama(model_path=MODEL_PATH, n_ctx=1024, n_threads=2, n_gpu_layers=0, verbose=False, use_mmap=False)
33
  _ready = True
34
  print(f"[models] 加载完成 ({time.time()-t0:.1f}s)")
35
  except Exception as e:
@@ -43,7 +57,6 @@ async def health():
43
 
44
  @app.post("/v1/chat/completions")
45
  async def chat_completions(request: Request):
46
- global _llm
47
  if not _ready or _llm is None:
48
  return JSONResponse({"error": "模型加载中"}, status_code=503)
49
  body = await request.json()
@@ -53,8 +66,7 @@ async def chat_completions(request: Request):
53
  prompt = messages[-1]["content"] if messages else ""
54
  with _llm_lock:
55
  out = _llm.create_completion(prompt, max_tokens=max_tokens, temperature=temperature)
56
- content = out["choices"][0]["text"].strip()
57
- return {"choices": [{"message": {"content": content}}]}
58
 
59
 
60
  @app.post("/task/start")
@@ -94,7 +106,6 @@ async def analyze_check(task_id: str):
94
 
95
 
96
  def _do_task(task_id, body):
97
- global _llm
98
  text = body.get("task", body.get("text", body.get("question", "")))
99
  if _llm is None:
100
  _tasks[task_id] = {"status": "error", "result": "模型未就绪"}
 
1
  """
2
+ OpenWolf 文本 Space — llama-cpp-python(GGUF / MiniCPM)
3
+ 模型运行时下载(和 OpenWolf-Agent 一样的方式)
4
  """
5
  import os, time, threading, uuid
6
+ from pathlib import Path
7
  from fastapi import FastAPI, Request
8
  from fastapi.responses import JSONResponse
9
 
 
13
  _llm = None
14
  _llm_lock = threading.Lock()
15
  _tasks = {}
16
+ MODEL_REPO = "openbmb/MiniCPM3-4B-GGUF"
17
+ MODEL_FILE = "minicpm3-4b-q4_k_m.gguf"
18
+ MODEL_DIR = Path("/app/models")
19
 
20
 
21
  @app.on_event("startup")
 
25
 
26
  def _load_model():
27
  global _llm, _ready
28
+ MODEL_DIR.mkdir(parents=True, exist_ok=True)
29
+ model_path = MODEL_DIR / MODEL_FILE
30
+
31
+ if not model_path.exists():
32
+ print(f"[models] 下载 {MODEL_REPO}/{MODEL_FILE}...")
33
+ from huggingface_hub import hf_hub_download
34
+ t0 = time.time()
35
+ try:
36
+ hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE, local_dir=str(MODEL_DIR))
37
+ print(f"[models] 下载完成 ({time.time()-t0:.1f}s)")
38
+ except Exception as e:
39
+ print(f"[models] 下载失败: {e}")
40
+ return
41
+
42
  print("[models] 加载 GGUF 模型...")
43
  t0 = time.time()
44
  from llama_cpp import Llama
45
  try:
46
+ _llm = Llama(model_path=str(model_path), n_ctx=1024, n_threads=2, n_gpu_layers=0, verbose=False)
47
  _ready = True
48
  print(f"[models] 加载完成 ({time.time()-t0:.1f}s)")
49
  except Exception as e:
 
57
 
58
  @app.post("/v1/chat/completions")
59
  async def chat_completions(request: Request):
 
60
  if not _ready or _llm is None:
61
  return JSONResponse({"error": "模型加载中"}, status_code=503)
62
  body = await request.json()
 
66
  prompt = messages[-1]["content"] if messages else ""
67
  with _llm_lock:
68
  out = _llm.create_completion(prompt, max_tokens=max_tokens, temperature=temperature)
69
+ return {"choices": [{"message": {"content": out["choices"][0]["text"].strip()}}]}
 
70
 
71
 
72
  @app.post("/task/start")
 
106
 
107
 
108
  def _do_task(task_id, body):
 
109
  text = body.get("task", body.get("text", body.get("question", "")))
110
  if _llm is None:
111
  _tasks[task_id] = {"status": "error", "result": "模型未就绪"}