# MiniCPM5-1B-API / app.py
# CrazyQuantz's picture
# Upload 5 files
# ba10c2b verified
import os
import logging
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel, Field
from typing import List, Optional, Union, Dict, Any
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
# Send timestamped records at INFO level and above to the console so they
# show up in the Hugging Face container logs.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
# Named logger shared by the rest of this module.
logger = logging.getLogger("MiniCPM-API")
# ASGI application object; the route decorators below register on it and the
# __main__ guard hands it to uvicorn.
app = FastAPI(title="MiniCPM5-1B GGUF API")
# --- Model Configuration & Download ---
# Hub repository that hosts the GGUF build of MiniCPM5-1B.
REPO_ID = "openbmb/MiniCPM5-1B-GGUF"
# Keep the exact capitalisation of the file name (capital M, C, P, M, B and Q).
FILENAME = "MiniCPM5-1B-Q8_0.gguf"

# Fetch the weights at import time; `model_path` is the local path returned
# by the Hub client.
logger.info(f"Downloading model {FILENAME} from {REPO_ID}...")
model_path = hf_hub_download(
    filename=FILENAME,
    repo_id=REPO_ID,
)
logger.info("Model downloaded successfully!")

# Single shared llama.cpp instance used by the chat endpoint.
# The context window is deliberately modest (2048 tokens): MiniCPM5 is noted
# to support up to 131k natively, but a larger window risks OOM crashes on a
# basic CPU host.
llm = Llama(
    model_path=model_path,
    n_ctx=2048,
    embedding=False,
)
# --- OpenAI Structure Pydantic Models ---
class ChatMessage(BaseModel):
    # One turn of the conversation, as sent by the client.
    # `role` is upper-cased for the request log and forwarded to the model
    # under the "role" key; `content` is forwarded under the "content" key.
    role: str
    content: str
class ChatCompletionRequest(BaseModel):
    # Request body accepted by POST /v1/chat/completions.
    # Every field except `messages` is optional and has the default shown.
    messages: List[ChatMessage]  # conversation turns, forwarded in the given order
    temperature: Union[float, None] = 0.7  # sampling temperature
    top_p: Union[float, None] = 0.9  # nucleus-sampling threshold
    max_tokens: Union[int, None] = 512  # generation length limit
    # Accepted for client compatibility; the endpoint always calls the model
    # with stream=False.
    stream: Union[bool, None] = False
@app.get("/")
def home():
    # Health probe: always answers with a fixed status and the model name.
    health = {}
    health["status"] = "healthy"
    health["model"] = "MiniCPM5-1B-Q8_0"
    return health
@app.post("/v1/chat/completions")
def chat_completions(request: ChatCompletionRequest):
    """Run a non-streaming chat completion and return the model's response.

    The request messages are handed to ``llm.create_chat_completion`` and the
    completion dictionary it produces is returned unchanged.

    Args:
        request: Conversation messages plus optional sampling settings.

    Returns:
        The completion dictionary produced by ``llm.create_chat_completion``.

    Raises:
        HTTPException: With status 500 (and the error text as ``detail``)
            when inference fails or the response has an unexpected shape.
    """
    # NOTE(review): this is a plain `def` endpoint sharing the single
    # module-level `llm`; if the server runs such endpoints on worker
    # threads, overlapping requests would call `llm` concurrently --
    # confirm that is safe or serialize the calls.

    # 1. Log the incoming conversation and sampling parameters.
    logger.info("====== NEW REQUEST RECEIVED ======")
    for msg in request.messages:
        logger.info(f"[{msg.role.upper()}]: {msg.content}")
    logger.info(
        f"Parameters -> Temp: {request.temperature}, Top_P: {request.top_p}, "
        f"Max Tokens: {request.max_tokens}"
    )

    if request.stream:
        # The reply below is always one JSON document. Make the fallback
        # visible in the logs instead of silently ignoring the flag.
        logger.warning(
            "stream=true was requested but streaming is not supported; "
            "returning a regular JSON response"
        )

    # 2. No prompt template is built here: the role/content dictionaries are
    # passed to create_chat_completion as-is.
    formatted_messages = [
        {"role": m.role, "content": m.content} for m in request.messages
    ]

    # The sampling fields are Optional, so a client may send an explicit null.
    # Forward only the values that are actually set; an omitted keyword lets
    # create_chat_completion fall back to its own default instead of
    # receiving None.
    sampling_options = {
        name: value
        for name, value in (
            ("temperature", request.temperature),
            ("top_p", request.top_p),
            ("max_tokens", request.max_tokens),
        )
        if value is not None
    }

    try:
        # 3. Invoke inference via llama-cpp.
        response = llm.create_chat_completion(
            messages=formatted_messages,
            stream=False,  # Keep false for basic JSON response handling
            **sampling_options,
        )
        assistant_response = response["choices"][0]["message"]["content"]
    except Exception as e:
        # logger.exception keeps the traceback in the container logs.
        logger.exception(f"Inference failed: {e}")
        raise HTTPException(status_code=500, detail=str(e)) from e

    logger.info(f"[ASSISTANT]: {assistant_response}")
    logger.info("==================================")
    return response
if __name__ == "__main__":
    # uvicorn is only needed when this file is launched directly as a script,
    # so it is imported here rather than at the top of the module.
    from uvicorn import run as serve

    # Listen on every interface, port 7860.
    serve(app, host="0.0.0.0", port=7860)