"""OpenAI-style chat-completions HTTP API backed by a GGUF model via llama.cpp.

On import this module downloads the GGUF file from the Hugging Face Hub and
loads it into a single shared ``Llama`` instance, then exposes:

* ``GET /``                     - health check.
* ``POST /v1/chat/completions`` - non-streaming chat completion.
"""

import os
import logging
import threading
from typing import List, Optional, Union, Dict, Any

from fastapi import FastAPI, HTTPException
from pydantic import BaseModel, Field
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# Set up logging to console for Hugging Face Container Logs
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger("MiniCPM-API")

app = FastAPI(title="MiniCPM5-1B GGUF API")

# --- Model Configuration & Download ---
# Using an available GGUF repository for MiniCPM5-1B
REPO_ID = "openbmb/MiniCPM5-1B-GGUF"
FILENAME = "MiniCPM5-1B-Q8_0.gguf"  # Note the capital M, C, P, M, B, and Q

# Sampling defaults; also used as the fallback when a client sends an
# explicit JSON ``null`` for the corresponding request field.
DEFAULT_TEMPERATURE = 0.7
DEFAULT_TOP_P = 0.9
DEFAULT_MAX_TOKENS = 512

logger.info("Downloading model %s from %s...", FILENAME, REPO_ID)
model_path = hf_hub_download(repo_id=REPO_ID, filename=FILENAME)
logger.info("Model downloaded successfully!")

# Initialize Llama.cpp with a reasonable context window for CPU (e.g., 2048 or 4096)
# Note: MiniCPM5 natively supports up to 131k, but on a Basic CPU, keep it
# modest to avoid OOM crashes.
llm = Llama(model_path=model_path, n_ctx=2048, embedding=False)

# The endpoint below is a plain ``def``, which FastAPI executes in a worker
# thread pool, so several requests can reach ``llm`` at the same time. All
# requests share this one model instance, so inference is serialized here.
_llm_lock = threading.Lock()


# --- OpenAI Structure Pydantic Models ---
class ChatMessage(BaseModel):
    """One chat turn: the speaker's role and the text it said."""

    role: str
    content: str


class ChatCompletionRequest(BaseModel):
    """Request body accepted by ``POST /v1/chat/completions``."""

    messages: List[ChatMessage]
    temperature: Optional[float] = DEFAULT_TEMPERATURE
    top_p: Optional[float] = DEFAULT_TOP_P
    max_tokens: Optional[int] = DEFAULT_MAX_TOKENS
    # Accepted for client compatibility; responses are never streamed.
    stream: Optional[bool] = False


@app.get("/")
def home():
    """Return a static health payload naming the served model."""
    return {"status": "healthy", "model": "MiniCPM5-1B-Q8_0"}


@app.post("/v1/chat/completions")
def chat_completions(request: ChatCompletionRequest):
    """Run one non-streaming chat completion and return llama.cpp's response.

    Args:
        request: Conversation messages plus sampling parameters.

    Returns:
        The object returned by ``llm.create_chat_completion`` unchanged.

    Raises:
        HTTPException: Status 500 with the error text when inference or
            reading the assistant message from the response fails.
    """
    # The fields are Optional, so a client may send an explicit null; fall
    # back to the defaults instead of forwarding None to the sampler.
    temperature = (
        DEFAULT_TEMPERATURE if request.temperature is None else request.temperature
    )
    top_p = DEFAULT_TOP_P if request.top_p is None else request.top_p

    # 1. Format the Prompt Logging
    logger.info("====== NEW REQUEST RECEIVED ======")
    for msg in request.messages:
        logger.info("[%s]: %s", msg.role.upper(), msg.content)
    logger.info(
        "Parameters -> Temp: %s, Top_P: %s, Max Tokens: %s",
        temperature,
        top_p,
        request.max_tokens,
    )

    if request.stream:
        # Make the fallback visible instead of silently ignoring the flag.
        logger.warning(
            "stream=True was requested but streaming is not supported; "
            "returning a single JSON response"
        )

    # 2. Build template manually or map roles
    # MiniCPM5-1B uses standard Llama-style formatting or built-in chat syntax.
    # llama-cpp-python can parse standard chat dictionaries directly.
    formatted_messages = [
        {"role": m.role, "content": m.content} for m in request.messages
    ]

    try:
        # 3. Invoke inference via llama-cpp (one request at a time)
        with _llm_lock:
            response = llm.create_chat_completion(
                messages=formatted_messages,
                temperature=temperature,
                top_p=top_p,
                max_tokens=request.max_tokens,
                stream=False,  # Keep false for basic JSON response handling
            )

        assistant_response = response["choices"][0]["message"]["content"]
        logger.info("[ASSISTANT]: %s", assistant_response)
        logger.info("==================================")

        return response
    except Exception as e:
        # Top-level boundary: record the full traceback, then surface a 500.
        logger.exception("Inference failed: %s", e)
        raise HTTPException(status_code=500, detail=str(e)) from e


if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=7860)