hugh007 committed on
Commit
0745c6f
·
verified ·
1 Parent(s): 0677a3d

feat: switch to MiniCPM-V-4.6-Thinking GGUF

Browse files
Files changed (2) hide show
  1. Dockerfile +9 -0
  2. app.py +17 -28
Dockerfile CHANGED
@@ -6,6 +6,15 @@ WORKDIR /app
6
  RUN pip install --no-cache-dir --timeout 300 llama-cpp-python==0.3.23 \
7
  --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
8
 
 
 
 
 
 
 
 
 
 
9
  COPY requirements.txt .
10
  RUN pip install -r requirements.txt --no-cache-dir
11
 
 
6
  RUN pip install --no-cache-dir --timeout 300 llama-cpp-python==0.3.23 \
7
  --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
8
 
9
+ # 下载 GGUF 模型(构建时打包)
10
+ RUN apt-get update && apt-get install -y --no-install-recommends curl \
11
+ && rm -rf /var/lib/apt/lists/*
12
+ RUN mkdir -p /app/models && \
13
+ curl -sL -o /app/models/MiniCPM-V-4_6-Thinking-Q4_K_M.gguf \
14
+ "https://huggingface.co/openbmb/MiniCPM-V-4.6-Thinking-gguf/resolve/main/MiniCPM-V-4_6-Thinking-Q4_K_M.gguf" && \
15
+ curl -sL -o /app/models/mmproj-model-f16.gguf \
16
+ "https://huggingface.co/openbmb/MiniCPM-V-4.6-Thinking-gguf/resolve/main/mmproj-model-f16.gguf"
17
+
18
  COPY requirements.txt .
19
  RUN pip install -r requirements.txt --no-cache-dir
20
 
app.py CHANGED
@@ -1,9 +1,7 @@
1
  """
2
- OpenWolf 文本 Space — llama-cpp-python(GGUF / MiniCPM)
3
- 模型运行时下载(和 OpenWolf-Agent 一样的方式)
4
  """
5
  import os, time, threading, uuid
6
- from pathlib import Path
7
  from fastapi import FastAPI, Request
8
  from fastapi.responses import JSONResponse
9
 
@@ -13,9 +11,8 @@ _ready = False
13
  _llm = None
14
  _llm_lock = threading.Lock()
15
  _tasks = {}
16
- MODEL_REPO = "openbmb/MiniCPM3-4B-GGUF"
17
- MODEL_FILE = "minicpm3-4b-q4_k_m.gguf"
18
- MODEL_DIR = Path("/app/models")
19
 
20
 
21
  @app.on_event("startup")
@@ -25,25 +22,15 @@ async def startup():
25
 
26
  def _load_model():
27
  global _llm, _ready
28
- MODEL_DIR.mkdir(parents=True, exist_ok=True)
29
- model_path = MODEL_DIR / MODEL_FILE
30
-
31
- if not model_path.exists():
32
- print(f"[models] 下载 {MODEL_REPO}/{MODEL_FILE}...")
33
- from huggingface_hub import hf_hub_download
34
- t0 = time.time()
35
- try:
36
- hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE, local_dir=str(MODEL_DIR))
37
- print(f"[models] 下载完成 ({time.time()-t0:.1f}s)")
38
- except Exception as e:
39
- print(f"[models] 下载失败: {e}")
40
- return
41
-
42
- print("[models] 加载 GGUF 模型...")
43
  t0 = time.time()
44
  from llama_cpp import Llama
45
  try:
46
- _llm = Llama(model_path=str(model_path), n_ctx=1024, n_threads=2, n_gpu_layers=0, verbose=False)
 
47
  _ready = True
48
  print(f"[models] 加载完成 ({time.time()-t0:.1f}s)")
49
  except Exception as e:
@@ -62,11 +49,10 @@ async def chat_completions(request: Request):
62
  body = await request.json()
63
  messages = body.get("messages", [])
64
  max_tokens = int(body.get("max_tokens", 512))
65
- temperature = float(body.get("temperature", 0.3))
66
- prompt = messages[-1]["content"] if messages else ""
67
  with _llm_lock:
68
- out = _llm.create_completion(prompt, max_tokens=max_tokens, temperature=temperature)
69
- return {"choices": [{"message": {"content": out["choices"][0]["text"].strip()}}]}
70
 
71
 
72
  @app.post("/task/start")
@@ -112,7 +98,10 @@ def _do_task(task_id, body):
112
  return
113
  try:
114
  with _llm_lock:
115
- out = _llm.create_completion(text, max_tokens=2048, temperature=0.3)
116
- _tasks[task_id] = {"status": "done", "result": out["choices"][0]["text"]}
 
 
 
117
  except Exception as e:
118
  _tasks[task_id] = {"status": "error", "result": str(e)}
 
1
  """
2
+ OpenWolf 文本 Space — llama-cpp-python(GGUF / MiniCPM-V-4.6-Thinking
 
3
  """
4
  import os, time, threading, uuid
 
5
  from fastapi import FastAPI, Request
6
  from fastapi.responses import JSONResponse
7
 
 
11
  _llm = None
12
  _llm_lock = threading.Lock()
13
  _tasks = {}
14
+ MODEL_PATH = "/app/models/MiniCPM-V-4_6-Thinking-Q4_K_M.gguf"
15
+ MMPROJ_PATH = "/app/models/mmproj-model-f16.gguf"
 
16
 
17
 
18
  @app.on_event("startup")
 
22
 
23
  def _load_model():
24
  global _llm, _ready
25
+ if not os.path.exists(MODEL_PATH):
26
+ print(f"[models] 模型文件不存在: {MODEL_PATH}")
27
+ return
28
+ print("[models] 加载模型...")
 
 
 
 
 
 
 
 
 
 
 
29
  t0 = time.time()
30
  from llama_cpp import Llama
31
  try:
32
+ _llm = Llama(model_path=MODEL_PATH, mmproj=MMPROJ_PATH,
33
+ n_ctx=2048, n_threads=2, n_gpu_layers=0, verbose=False)
34
  _ready = True
35
  print(f"[models] 加载完成 ({time.time()-t0:.1f}s)")
36
  except Exception as e:
 
49
  body = await request.json()
50
  messages = body.get("messages", [])
51
  max_tokens = int(body.get("max_tokens", 512))
52
+ temp = float(body.get("temperature", 0.3))
 
53
  with _llm_lock:
54
+ out = _llm.create_chat_completion(messages=messages, max_tokens=max_tokens, temperature=temp)
55
+ return {"choices": [{"message": {"content": out["choices"][0]["message"]["content"].strip()}}]}
56
 
57
 
58
  @app.post("/task/start")
 
98
  return
99
  try:
100
  with _llm_lock:
101
+ out = _llm.create_chat_completion(
102
+ messages=[{"role": "user", "content": text}],
103
+ max_tokens=2048, temperature=0.3,
104
+ )
105
+ _tasks[task_id] = {"status": "done", "result": out["choices"][0]["message"]["content"]}
106
  except Exception as e:
107
  _tasks[task_id] = {"status": "error", "result": str(e)}