CrazyQuantz committed on
Commit
ba10c2b
·
verified ·
1 Parent(s): 1fbf0b4

Upload 5 files

Browse files
Files changed (5) hide show
  1. Dockerfile +30 -0
  2. README.md +11 -11
  3. app.py +79 -0
  4. gitattributes +35 -0
  5. requirements.txt +4 -0
Dockerfile ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.10-slim
2
+
3
+ # Install system dependencies
4
+ RUN apt-get update && apt-get install -y \
5
+ build-essential \
6
+ python3-dev \
7
+ curl \
8
+ && rm -rf /var/lib/apt/lists/*
9
+
10
+ # Set up the Hugging Face standard user
11
+ RUN useradd -m -u 1000 user
12
+ USER user
13
+ ENV HOME=/home/user \
14
+ PATH=/home/user/.local/bin:$PATH
15
+ WORKDIR $HOME/app
16
+
17
+ # Prefer pre-built binary wheels over source distributions so pip does not compile llama-cpp-python from source
18
+ ENV PIP_PREFER_BINARY=1
19
+
20
+ # Install llama-cpp-python using pre-compiled wheels for CPU
21
+ RUN pip install --no-cache-dir llama-cpp-python --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
22
+
23
+ # Copy requirements and install the remaining packages (FastAPI, etc.)
24
+ COPY --chown=user requirements.txt .
25
+ RUN pip install --no-cache-dir -r requirements.txt
26
+
27
+ COPY --chown=user app.py .
28
+
29
+ EXPOSE 7860
30
+ CMD ["python", "app.py"]
README.md CHANGED
@@ -1,11 +1,11 @@
1
- ---
2
- title: MiniCPM5 1B API
3
- emoji: 🌍
4
- colorFrom: blue
5
- colorTo: green
6
- sdk: docker
7
- pinned: false
8
- license: apache-2.0
9
- ---
10
-
11
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
+ ---
2
+ title: MiniCPM5-1B API Server
3
+ emoji: 🚀
4
+ colorFrom: blue
5
+ colorTo: green
6
+ sdk: docker
7
+ app_port: 7860
8
+ pinned: false
9
+ ---
10
+
11
+ An OpenAI-compatible API server running MiniCPM5-1B (Q8_0 GGUF) on Basic CPU.
app.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import logging
3
+ from fastapi import FastAPI, HTTPException
4
+ from pydantic import BaseModel, Field
5
+ from typing import List, Optional, Union, Dict, Any
6
+ from huggingface_hub import hf_hub_download
7
+ from llama_cpp import Llama
8
+
9
# Set up logging to console for Hugging Face Container Logs
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger("MiniCPM-API")

app = FastAPI(title="MiniCPM5-1B GGUF API")

# --- Model Configuration & Download ---
# Each setting can be overridden with an environment variable; the defaults
# are the values this service has always used.
#   MODEL_REPO_ID  - Hugging Face repository that hosts the GGUF file
#   MODEL_FILENAME - file inside that repository (the name is case-sensitive)
#   MODEL_N_CTX    - context window, in tokens, handed to llama.cpp
REPO_ID = os.environ.get("MODEL_REPO_ID", "openbmb/MiniCPM5-1B-GGUF")
FILENAME = os.environ.get("MODEL_FILENAME", "MiniCPM5-1B-Q8_0.gguf")
# Kept modest by default to avoid OOM crashes on a Basic CPU instance.
N_CTX = int(os.environ.get("MODEL_N_CTX", "2048"))

# The download runs at import time, so the server only starts accepting
# requests once the model file is available locally.
logger.info("Downloading model %s from %s...", FILENAME, REPO_ID)
model_path = hf_hub_download(repo_id=REPO_ID, filename=FILENAME)
logger.info("Model downloaded successfully!")

# Single shared llama.cpp instance used by every request; embeddings are disabled.
llm = Llama(model_path=model_path, n_ctx=N_CTX, embedding=False)
27
+
28
+ # --- OpenAI Structure Pydantic Models ---
29
# One chat turn: who is speaking (`role`) and what was said (`content`).
# Both fields are required strings.
class ChatMessage(BaseModel):
    role: str
    content: str
32
+
33
# Request body accepted by POST /v1/chat/completions.
# Only `messages` is required; every other field falls back to the default
# shown on its line when the client leaves it out.
class ChatCompletionRequest(BaseModel):
    # Conversation so far, in order.
    messages: List[ChatMessage]

    # Sampling controls.
    temperature: Optional[float] = 0.7
    top_p: Optional[float] = 0.9

    # Value forwarded as the generation length limit.
    max_tokens: Optional[int] = 512

    # Accepted for client compatibility.
    stream: Optional[bool] = False
39
+
40
@app.get("/")
def home():
    """Report service health together with the name of the configured model.

    Returns:
        A dict with ``status`` fixed to ``"healthy"`` and ``model`` set to the
        configured GGUF filename without its ``.gguf`` extension.
    """
    # Derived from FILENAME so the reported name cannot drift from the file
    # that is actually loaded; "MiniCPM5-1B-Q8_0.gguf" yields "MiniCPM5-1B-Q8_0".
    return {"status": "healthy", "model": FILENAME.removesuffix(".gguf")}
43
+
44
# FastAPI executes plain ``def`` endpoints on a worker thread pool, so two
# requests could otherwise drive the single shared ``llm`` object at the same
# time. This lock serializes inference.
import threading

_LLM_LOCK = threading.Lock()


@app.post("/v1/chat/completions")
def chat_completions(request: ChatCompletionRequest):
    """Run one non-streaming chat completion against the module-level ``llm``.

    Args:
        request: Parsed request body holding the chat messages and the
            sampling parameters (temperature, top_p, max_tokens, stream).

    Returns:
        The object produced by ``llm.create_chat_completion``, returned
        unchanged.

    Raises:
        HTTPException: With status 500 and the error text as ``detail`` when
            inference or response handling fails.
    """
    # 1. Log the incoming conversation and the sampling parameters.
    logger.info("====== NEW REQUEST RECEIVED ======")
    for msg in request.messages:
        logger.info("[%s]: %s", msg.role.upper(), msg.content)
    logger.info(
        "Parameters -> Temp: %s, Top_P: %s, Max Tokens: %s",
        request.temperature,
        request.top_p,
        request.max_tokens,
    )

    # The call below always uses stream=False; say so in the logs instead of
    # silently ignoring the client's flag.
    if request.stream:
        logger.warning(
            "stream=True was requested, but streaming is not supported; "
            "returning a single JSON response."
        )

    # 2. Convert the pydantic messages into the plain role/content dicts that
    # llama-cpp-python's chat API takes.
    formatted_messages = [{"role": m.role, "content": m.content} for m in request.messages]

    # The request fields are Optional, so a client may send an explicit null.
    # Nulls are left out rather than forwarded as None, so llama-cpp-python
    # falls back to its own default for that parameter.
    requested = (
        ("temperature", request.temperature),
        ("top_p", request.top_p),
        ("max_tokens", request.max_tokens),
    )
    sampling = {key: value for key, value in requested if value is not None}

    try:
        # 3. Invoke inference via llama-cpp, one request at a time.
        with _LLM_LOCK:
            response = llm.create_chat_completion(
                messages=formatted_messages,
                stream=False,  # Keep false for basic JSON response handling
                **sampling,
            )

        assistant_response = response["choices"][0]["message"]["content"]
        logger.info("[ASSISTANT]: %s", assistant_response)
        logger.info("==================================")

        return response

    except Exception as e:
        # logger.exception records the traceback alongside the message.
        logger.exception("Inference failed: %s", e)
        raise HTTPException(status_code=500, detail=str(e)) from e
76
+
77
if __name__ == "__main__":
    # Imported here because uvicorn is only needed when this file is run
    # directly as a script.
    import uvicorn

    # Listen on all interfaces at port 7860.
    listen_host, listen_port = "0.0.0.0", 7860
    uvicorn.run(app, host=listen_host, port=listen_port)
gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ fastapi
2
+ uvicorn
3
+ pydantic
4
+ huggingface_hub