Hugh committed on
Commit
37de7ca
·
0 Parent(s):

init: MiniCPM-2B GGUF text Space

Browse files
Files changed (3) hide show
  1. Dockerfile +19 -0
  2. app.py +106 -0
  3. requirements.txt +4 -0
Dockerfile ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# CPU-only image that serves a GGUF model through llama-cpp-python + FastAPI.
FROM python:3.11-slim

# Without a TTY, Python block-buffers stdout; disable buffering so the
# print()-based progress messages in app.py show up in the container log
# as they happen.
ENV PYTHONUNBUFFERED=1

WORKDIR /app

# build-essential/cmake are needed to compile llama-cpp-python from source;
# curl is used by the HEALTHCHECK below.
RUN apt-get update && apt-get install -y --no-install-recommends \
        build-essential curl cmake \
    && rm -rf /var/lib/apt/lists/*

# Build llama.cpp without BLAS/CUDA. Kept in its own layer, ahead of
# requirements.txt, so the slow native build stays cached when only the
# Python requirements change.
RUN CMAKE_ARGS="-DGGML_BLAS=OFF -DGGML_CUDA=OFF" pip install llama-cpp-python==0.3.8 --no-cache-dir

COPY requirements.txt .
RUN pip install -r requirements.txt --no-cache-dir

# Hugging Face Docker Spaces run the container as UID 1000. app.py creates
# /app/models and downloads the model into it at startup, so that directory
# must exist and be writable by the runtime user rather than by root only.
RUN useradd -m -u 1000 user \
    && mkdir -p /app/models \
    && chown -R user:user /app
USER user
ENV HOME=/home/user

COPY --chown=user:user app.py .

HEALTHCHECK --interval=30s --timeout=10s --retries=3 \
    CMD curl -f http://localhost:7860/health || exit 1

CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
app.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ OpenWolf 文本 Space — MiniCPM-2B GGUF(在线 API 兜底)
3
+ 启动时自动下载模型,提供 OpenAI 兼容的 /v1/chat/completions 接口
4
+ """
5
+ import os
6
+ import time
7
+ import threading
8
+ from pathlib import Path
9
+ from fastapi import FastAPI, Request, HTTPException
10
+ from fastapi.responses import JSONResponse
11
+
12
# ASGI application object; the Dockerfile launches it as "app:app".
app = FastAPI(title="OpenWolf Text")

# Source repository / file name of the quantized GGUF weights on the
# Hugging Face Hub, and the local directory they are downloaded into.
MODEL_REPO = "runfuture/MiniCPM-2B-dpo-q4km-gguf"
MODEL_FILE = "MiniCPM-2B-dpo-q4km-gguf.gguf"
MODEL_DIR = Path("/app/models")

# State shared between the background loader thread and request handlers.
_llm = None  # replaced by the loaded llama_cpp.Llama instance in _load_model()
_ready = False  # set to True by _load_model() once _llm has been created
_llm_lock = threading.Lock()  # held around each inference call on _llm
21
+
22
+
23
@app.on_event("startup")
async def startup():
    """Begin loading the model in the background when the server starts.

    The work is handed to a daemon thread so this startup hook returns
    immediately instead of waiting for the download and model load.
    """
    loader = threading.Thread(target=_load_model, daemon=True)
    loader.start()
26
+
27
+
28
def _load_model():
    """Download the GGUF file if it is missing, then load it into memory.

    Intended to run in a background thread (see ``startup``). On success the
    module-level ``_llm`` holds the ``Llama`` instance and ``_ready`` is set
    to True. On any failure the error is logged with its full traceback and
    ``_ready`` stays False, so the chat endpoint keeps answering 503.
    """
    global _llm, _ready
    try:
        MODEL_DIR.mkdir(parents=True, exist_ok=True)
        model_path = MODEL_DIR / MODEL_FILE

        if not model_path.exists():
            # flush=True: stdout is block-buffered when not attached to a
            # TTY, which would otherwise hide progress in container logs.
            print(f"[models] 下载 {MODEL_REPO}/{MODEL_FILE} (~1.7GB)...", flush=True)
            # Imported lazily so the web server can start before the heavy
            # dependency is touched.
            from huggingface_hub import hf_hub_download

            # Monotonic clock: elapsed time is unaffected by clock changes.
            started = time.monotonic()
            hf_hub_download(
                repo_id=MODEL_REPO,
                filename=MODEL_FILE,
                local_dir=str(MODEL_DIR),
            )
            print(f"[models] 下载完成 ({time.monotonic()-started:.1f}s)", flush=True)

        print("[models] 加载 GGUF 模型...", flush=True)
        started = time.monotonic()
        from llama_cpp import Llama

        _llm = Llama(
            model_path=str(model_path),
            n_ctx=2048,
            n_threads=2,
            n_gpu_layers=0,  # CPU-only inference
            verbose=False,
            use_mmap=True,
        )
        # Publish readiness only after _llm is assigned, so a handler that
        # observes _ready == True always sees a usable model.
        _ready = True
        print(f"[models] 加载完成 ({time.monotonic()-started:.1f}s)", flush=True)
    except Exception as e:
        # Top-level boundary of the loader thread: never let it die
        # silently, and keep the traceback so the root cause is diagnosable.
        import traceback

        print(f"[models] 加载失败: {e}", flush=True)
        traceback.print_exc()
60
+
61
+
62
@app.get("/health")
async def health():
    """Liveness probe.

    Always reports ``status: ok``; ``ready`` mirrors the module-level
    ``_ready`` flag, i.e. whether the model has finished loading.
    """
    payload = {"status": "ok", "ready": _ready}
    return payload
65
+
66
+
67
@app.post("/v1/chat/completions")
async def chat_completions(request: Request):
    """Handle a chat-completion request with an OpenAI-style JSON body.

    Request fields read: ``messages`` (list of ``{"role", "content"}``
    objects, default ``[]``), ``max_tokens`` (default 512) and
    ``temperature`` (default 0.3).

    Returns:
        ``{"choices": [{"message": {"content": <text>}}]}`` on success, or a
        503 JSON response while the model is still loading.

    Raises:
        HTTPException: 400 when the body is not valid JSON, is not a JSON
            object, ``messages`` is not a list of objects, or the numeric
            parameters cannot be converted.
    """
    if not _ready:
        return JSONResponse({"error": "模型加载中"}, status_code=503)

    try:
        body = await request.json()
    except ValueError as exc:
        # JSON and UTF-8 decoding errors are ValueError subclasses. A bare
        # ``except`` would also swallow task cancellation.
        raise HTTPException(status_code=400, detail="Invalid JSON") from exc

    if not isinstance(body, dict):
        raise HTTPException(status_code=400, detail="Request body must be a JSON object")

    messages = body.get("messages", [])
    if not isinstance(messages, list) or not all(isinstance(m, dict) for m in messages):
        raise HTTPException(status_code=400, detail="messages must be a list of objects")

    try:
        max_tokens = int(body.get("max_tokens", 512))
        temperature = float(body.get("temperature", 0.3))
    except (TypeError, ValueError) as exc:
        raise HTTPException(
            status_code=400,
            detail="max_tokens and temperature must be numeric",
        ) from exc

    # NOTE(review): the conversation is flattened into one string with
    # <|role|> markers and sent as a single user message, so
    # create_chat_completion presumably wraps it in its own chat template as
    # well — confirm this double formatting is intended for this model.
    prompt = _format_messages(messages)

    def _generate():
        # Only one inference at a time on the shared model instance.
        with _llm_lock:
            return _llm.create_chat_completion(
                messages=[{"role": "user", "content": prompt}],
                max_tokens=max_tokens,
                temperature=temperature,
            )

    # Inference is a long blocking call; run it in a worker thread so the
    # event loop stays free to answer /health while a completion is running.
    import asyncio

    out = await asyncio.to_thread(_generate)

    content = out["choices"][0]["message"]["content"].strip()
    return {"choices": [{"message": {"content": content}}]}
92
+
93
+
94
def _content_to_text(content):
    """Coerce a message ``content`` value to plain text.

    ``None`` becomes an empty string; a list of parts (plain strings or
    ``{"type": "text", "text": ...}`` objects) is joined with newlines and
    non-text parts are skipped; any other value is converted with ``str()``.
    """
    if content is None:
        return ""
    if isinstance(content, str):
        return content
    if isinstance(content, list):
        texts = []
        for part in content:
            if isinstance(part, str):
                texts.append(part)
            elif isinstance(part, dict) and part.get("type") == "text":
                texts.append(str(part.get("text", "")))
        return "\n".join(texts)
    return str(content)


def _format_messages(messages):
    """Flatten chat messages into a single tagged prompt string.

    Each message with role ``system``, ``user`` or ``assistant`` becomes
    ``<|role|>\\n<content>``; messages with any other role are skipped, and a
    missing role is treated as ``user``. The segments are joined with
    newlines and a trailing ``<|assistant|>\\n`` cue is always appended.

    Args:
        messages: iterable of dicts with optional ``role`` and ``content``
            keys.

    Returns:
        The prompt string; ``"<|assistant|>\\n"`` when ``messages`` is empty.
    """
    role_tags = {
        "system": "<|system|>",
        "user": "<|user|>",
        "assistant": "<|assistant|>",
    }
    parts = []
    for message in messages:
        role = message.get("role", "user")
        # isinstance guard: an unhashable role (e.g. a list) must be skipped
        # like any other unknown role rather than raise on the dict lookup.
        tag = role_tags.get(role) if isinstance(role, str) else None
        if tag is None:
            continue
        text = _content_to_text(message.get("content", ""))
        parts.append(f"{tag}\n{text}")
    parts.append("<|assistant|>\n")
    return "\n".join(parts)
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ fastapi==0.115.6
2
+ uvicorn[standard]==0.34.0
3
+ pydantic==2.10.4
4
+ huggingface-hub==0.27.1