# Hugging Face Space (status: Running)
| import os | |
| import logging | |
| from fastapi import FastAPI, HTTPException | |
| from pydantic import BaseModel, Field | |
| from typing import List, Optional, Union, Dict, Any | |
| from huggingface_hub import hf_hub_download | |
| from llama_cpp import Llama | |
# Send INFO-and-above records to the console so they appear in the
# Hugging Face container logs.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger("MiniCPM-API")

# ASGI application object that uvicorn serves.
app = FastAPI(title="MiniCPM5-1B GGUF API")
# --- Model Configuration & Download ---
# Hugging Face repository and file that hold the MiniCPM5-1B GGUF weights.
REPO_ID = "openbmb/MiniCPM5-1B-GGUF"
FILENAME = "MiniCPM5-1B-Q8_0.gguf"  # Note the capital M, C, P, M, B, and Q

# Fetch the weights at import time; `model_path` is whatever local path
# hf_hub_download returns for the requested file.
logger.info(f"Downloading model {FILENAME} from {REPO_ID}...")
model_path = hf_hub_download(
    repo_id=REPO_ID,
    filename=FILENAME,
)
logger.info("Model downloaded successfully!")

# Initialize llama.cpp with a modest context window (2048 tokens).
# Note: MiniCPM5 natively supports up to 131k, but on a Basic CPU a small
# window is kept to avoid OOM crashes (2048 or 4096 are reasonable choices).
llm = Llama(
    model_path=model_path,
    n_ctx=2048,
    embedding=False,
)
| # --- OpenAI Structure Pydantic Models --- | |
class ChatMessage(BaseModel):
    """One message of a chat conversation; both fields are required strings."""

    # Who authored the message.
    role: str = Field(...)
    # The message text.
    content: str = Field(...)
class ChatCompletionRequest(BaseModel):
    """Request body for a chat completion.

    Only ``messages`` is required; every other field falls back to the
    default declared below when it is omitted.
    """

    # The conversation, as a list of role/content messages.
    messages: List[ChatMessage] = Field(...)
    # Sampling parameters.
    temperature: Optional[float] = Field(default=0.7)
    top_p: Optional[float] = Field(default=0.9)
    # Upper bound on the number of generated tokens.
    max_tokens: Optional[int] = Field(default=512)
    # Streaming flag supplied by the client.
    stream: Optional[bool] = Field(default=False)
# Register the handler on the app; without a path-operation decorator the
# function is never reachable over HTTP.
@app.get("/")
def home():
    """Return a static health payload naming the served model.

    Returns:
        A dict with ``status`` set to ``"healthy"`` and ``model`` set to the
        model name.
    """
    return {"status": "healthy", "model": "MiniCPM5-1B-Q8_0"}
# Registered at the OpenAI-style path; without a path-operation decorator the
# handler is never reachable over HTTP.
@app.post("/v1/chat/completions")
def chat_completions(request: ChatCompletionRequest):
    """Run a non-streaming chat completion and return llama-cpp's response.

    The conversation, the sampling parameters, and the assistant reply are
    all written to the log at INFO level.

    Args:
        request: The conversation messages plus sampling parameters. The
            ``stream`` field is accepted but not honored: inference is
            always invoked with ``stream=False``.

    Returns:
        The response object produced by ``llm.create_chat_completion``,
        unmodified.

    Raises:
        HTTPException: Status 500 when inference, or reading the reply out
            of the response, raises; ``detail`` carries the error text.
    """
    # 1. Log the incoming conversation and sampling settings.
    logger.info("====== NEW REQUEST RECEIVED ======")
    for msg in request.messages:
        logger.info("[%s]: %s", msg.role.upper(), msg.content)
    logger.info(
        "Parameters -> Temp: %s, Top_P: %s, Max Tokens: %s",
        request.temperature,
        request.top_p,
        request.max_tokens,
    )

    # 2. Convert the pydantic messages into the plain role/content dicts
    #    that llama-cpp-python's chat API accepts.
    formatted_messages = [
        {"role": m.role, "content": m.content} for m in request.messages
    ]

    try:
        # 3. Invoke inference via llama-cpp.
        response = llm.create_chat_completion(
            messages=formatted_messages,
            temperature=request.temperature,
            top_p=request.top_p,
            max_tokens=request.max_tokens,
            stream=False,  # Keep false for basic JSON response handling
        )
        assistant_response = response["choices"][0]["message"]["content"]
    except Exception as e:
        # Boundary handler: record the full traceback, then surface the
        # failure to the client as a 500 with the original error chained.
        logger.exception("Inference failed: %s", e)
        raise HTTPException(status_code=500, detail=str(e)) from e

    logger.info("[ASSISTANT]: %s", assistant_response)
    logger.info("==================================")
    return response
if __name__ == "__main__":
    # Imported here because it is only needed when the file is run directly.
    import uvicorn

    # Serve the app on all network interfaces, port 7860.
    uvicorn.run(
        app,
        host="0.0.0.0",
        port=7860,
    )