Spaces:
Running
Running
Upload 5 files
Browse files- Dockerfile +30 -0
- README.md +11 -11
- app.py +79 -0
- gitattributes +35 -0
- requirements.txt +4 -0
Dockerfile
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.10-slim

# Install system dependencies. The compiler toolchain and Python headers are
# presumably a fallback for building llama-cpp-python from source if no
# matching wheel is found. --no-install-recommends keeps optional packages
# out of the image.
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    python3-dev \
    curl \
    && rm -rf /var/lib/apt/lists/*

# Set up the Hugging Face standard user (UID 1000) and run everything below
# as that user, with user-level pip scripts on PATH.
RUN useradd -m -u 1000 user
USER user
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH
WORKDIR $HOME/app

# Make pip prefer pre-built wheels over source distributions, even when the
# source distribution is newer, so it does not fall back to a slow source
# build. This does not affect pip's cache; --no-cache-dir below handles that.
ENV PIP_PREFER_BINARY=1

# Install llama-cpp-python using pre-compiled wheels for CPU
RUN pip install --no-cache-dir llama-cpp-python --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu

# Copy requirements and install the remaining packages (FastAPI, etc.).
# Done before copying app.py so this layer is reused when only app.py changes.
COPY --chown=user requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY --chown=user app.py .

# app.py serves on port 7860.
EXPOSE 7860
CMD ["python", "app.py"]
|
README.md
CHANGED
|
@@ -1,11 +1,11 @@
|
|
| 1 |
-
---
|
| 2 |
-
title: MiniCPM5
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom: blue
|
| 5 |
-
colorTo: green
|
| 6 |
-
sdk: docker
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
---
|
| 10 |
-
|
| 11 |
-
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: MiniCPM5-1B API Server
|
| 3 |
+
emoji: 🚀
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: green
|
| 6 |
+
sdk: docker
|
| 7 |
+
app_port: 7860
|
| 8 |
+
pinned: false
|
| 9 |
+
---
|
| 10 |
+
|
| 11 |
+
An OpenAI-compatible API server running MiniCPM5-1B (Q8_0 GGUF) on Basic CPU.
|
app.py
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import logging
import os
import threading
from typing import List, Optional, Union, Dict, Any

from fastapi import FastAPI, HTTPException
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
from pydantic import BaseModel, Field
|
| 8 |
+
|
| 9 |
+
# Send timestamped INFO-level records to the console so they show up in the
# Hugging Face container logs.
_LOG_FORMAT = "%(asctime)s - %(levelname)s - %(message)s"
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)
logger = logging.getLogger("MiniCPM-API")

# The ASGI application; routes are registered on it further down.
app = FastAPI(title="MiniCPM5-1B GGUF API")
|
| 14 |
+
|
| 15 |
+
# --- Model Configuration & Download ---
# Defaults select the Q8_0 GGUF build of MiniCPM5-1B. Each value can be
# overridden with an environment variable so a different model or context
# size can be used without editing this file.
REPO_ID = os.environ.get("MODEL_REPO_ID", "openbmb/MiniCPM5-1B-GGUF")
# Note the capital M, C, P, M, B, and Q in the default filename.
FILENAME = os.environ.get("MODEL_FILENAME", "MiniCPM5-1B-Q8_0.gguf")
# Context window in tokens. MiniCPM5 natively supports up to 131k, but on a
# Basic CPU keep it modest (e.g., 2048 or 4096) to avoid OOM crashes.
# A non-integer value fails fast at startup with ValueError.
N_CTX = int(os.environ.get("MODEL_N_CTX", "2048"))

# Runs at import time: the model file is fetched before the server starts.
logger.info("Downloading model %s from %s...", FILENAME, REPO_ID)
model_path = hf_hub_download(repo_id=REPO_ID, filename=FILENAME)
logger.info("Model downloaded successfully!")

# Single shared llama.cpp model instance used by every request.
llm = Llama(model_path=model_path, n_ctx=N_CTX, embedding=False)
|
| 27 |
+
|
| 28 |
+
# --- OpenAI Structure Pydantic Models ---
|
| 29 |
+
class ChatMessage(BaseModel):
    """One entry of a chat conversation: who spoke and what was said."""

    # Speaker label; any string is accepted.
    role: str
    # Plain-text body of the message.
    content: str
|
| 32 |
+
|
| 33 |
+
class ChatCompletionRequest(BaseModel):
    """Request body accepted by the chat-completions endpoint.

    Only ``messages`` is required; every sampling option has a default.
    """

    # Conversation history, in order.
    messages: List[ChatMessage]

    # Sampling controls.
    temperature: Optional[float] = 0.7
    top_p: Optional[float] = 0.9

    # Upper bound on the number of generated tokens.
    max_tokens: Optional[int] = 512

    # Whether the client asked for a streamed response.
    stream: Optional[bool] = False
|
| 39 |
+
|
| 40 |
+
@app.get("/")
def home():
    """Return a small status payload naming the served model."""
    health_payload = {"status": "healthy", "model": "MiniCPM5-1B-Q8_0"}
    return health_payload
|
| 43 |
+
|
| 44 |
+
# FastAPI runs plain ``def`` endpoints in a worker thread pool, so several
# requests can reach the single shared ``llm`` instance at the same time.
# Serialize inference so only one request drives the model at once.
_llm_lock = threading.Lock()


@app.post("/v1/chat/completions")
def chat_completions(request: ChatCompletionRequest):
    """Run a non-streaming chat completion on the shared model.

    Args:
        request: Validated request body holding the conversation and the
            sampling parameters.

    Returns:
        The response object produced by ``llm.create_chat_completion``,
        returned unchanged.

    Raises:
        HTTPException: Status 500, with the error text as detail, when
            inference or reading the response fails.
    """
    # 1. Log the incoming conversation and sampling parameters.
    logger.info("====== NEW REQUEST RECEIVED ======")
    for msg in request.messages:
        logger.info("[%s]: %s", msg.role.upper(), msg.content)
    logger.info(
        "Parameters -> Temp: %s, Top_P: %s, Max Tokens: %s",
        request.temperature,
        request.top_p,
        request.max_tokens,
    )
    if request.stream:
        # Streaming is not implemented; make that visible in the logs instead
        # of silently ignoring the flag. The reply is still one JSON body.
        logger.warning(
            "stream=true was requested but streaming is not supported; "
            "returning a single JSON response"
        )

    # 2. Convert the pydantic messages into the plain role/content dicts that
    # llama-cpp-python's chat API accepts.
    formatted_messages = [
        {"role": m.role, "content": m.content} for m in request.messages
    ]

    try:
        # 3. Invoke inference via llama-cpp, one request at a time.
        with _llm_lock:
            response = llm.create_chat_completion(
                messages=formatted_messages,
                temperature=request.temperature,
                top_p=request.top_p,
                max_tokens=request.max_tokens,
                stream=False,  # Keep false for basic JSON response handling
            )

        assistant_response = response["choices"][0]["message"]["content"]
        logger.info("[ASSISTANT]: %s", assistant_response)
        logger.info("==================================")

        return response

    except Exception as e:
        # Top-level boundary: record the full traceback, then surface the
        # failure to the client as a 500 while keeping the original cause.
        logger.exception("Inference failed: %s", e)
        raise HTTPException(status_code=500, detail=str(e)) from e
|
| 76 |
+
|
| 77 |
+
if __name__ == "__main__":
    # Imported here so uvicorn is only needed when running as a script.
    import uvicorn

    # Listen on all interfaces on port 7860.
    bind_host, bind_port = "0.0.0.0", 7860
    uvicorn.run(app, host=bind_host, port=bind_port)
|
gitattributes
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
requirements.txt
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fastapi
|
| 2 |
+
uvicorn
|
| 3 |
+
pydantic
|
| 4 |
+
huggingface_hub
|