Spaces:

CrazyQuantz
/

MiniCPM5-1B-API

Running

App Files Files Community

MiniCPM5-1B-API / app.py

CrazyQuantz

Upload 5 files

ba10c2b verified 3 days ago

raw

history blame contribute delete

3.09 kB

	import logging
	import os
	import threading
	from typing import List, Optional, Union, Dict, Any

	from fastapi import FastAPI, HTTPException
	from huggingface_hub import hf_hub_download
	from llama_cpp import Llama
	from pydantic import BaseModel, Field

	# Send INFO-and-above records to the console so they appear in the
	# Hugging Face container logs; every record is timestamped and level-tagged.
	_LOG_FORMAT = "%(asctime)s - %(levelname)s - %(message)s"
	logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)
	logger = logging.getLogger("MiniCPM-API")

	# Application object that the route decorators in this file register on.
	app = FastAPI(title="MiniCPM5-1B GGUF API")

	# --- Model Configuration & Download ---
	# Hub repository and file name of the GGUF weights served by this API.
	REPO_ID = "openbmb/MiniCPM5-1B-GGUF"
	FILENAME = "MiniCPM5-1B-Q8_0.gguf"  # Case matters: capital M, C, P, M, B, and Q

	# Context window handed to llama.cpp. The original author notes that MiniCPM5
	# natively supports up to 131k tokens, but on a Basic CPU a modest window
	# (e.g. 2048 or 4096) avoids OOM crashes.
	N_CTX = 2048

	# NOTE: this runs at import time, so the module blocks here until the
	# download call returns.
	logger.info(f"Downloading model {FILENAME} from {REPO_ID}...")
	model_path = hf_hub_download(filename=FILENAME, repo_id=REPO_ID)
	logger.info("Model downloaded successfully!")

	# Single shared model instance used by the request handlers; embeddings are
	# explicitly disabled.
	llm = Llama(
	    model_path=model_path,
	    n_ctx=N_CTX,
	    embedding=False,
	)

	# --- OpenAI Structure Pydantic Models ---
	class ChatMessage(BaseModel):
	    """A single chat turn: the speaker's role and the text of the turn."""

	    # Both fields are required. `role` is a free-form string; no fixed set of
	    # role names is enforced by this model.
	    role: str = Field(...)
	    content: str = Field(...)

	class ChatCompletionRequest(BaseModel):
	    """Request body for the POST /v1/chat/completions endpoint.

	    Only `messages` is required; every sampling option has a default and also
	    accepts an explicit null.
	    """

	    # Conversation to complete, in order.
	    messages: List[ChatMessage] = Field(...)
	    # Sampling options forwarded to the model by the endpoint.
	    temperature: Optional[float] = Field(default=0.7)
	    top_p: Optional[float] = Field(default=0.9)
	    max_tokens: Optional[int] = Field(default=512)
	    # Accepted in the request body, but the endpoint in this file always calls
	    # the model with stream=False.
	    stream: Optional[bool] = Field(default=False)

	@app.get("/")
	def home():
	    """Return a fixed status document naming the served model.

	    The payload is static: nothing about the loaded model is inspected here.
	    """
	    status_payload = {"status": "healthy", "model": "MiniCPM5-1B-Q8_0"}
	    return status_payload

	# FastAPI executes plain `def` endpoints in a worker thread pool, so several
	# requests can reach the single shared `llm` instance at once. Inference is
	# serialized with this lock so concurrent requests queue instead of racing.
	_INFERENCE_LOCK = threading.Lock()


	@app.post("/v1/chat/completions")
	def chat_completions(request: ChatCompletionRequest):
	    """Run one non-streaming chat completion on the shared model.

	    Logs the incoming messages and sampling parameters, forwards them to
	    ``llm.create_chat_completion``, logs the text of the first choice, and
	    returns the completion object unchanged.

	    Args:
	        request: Validated request body (messages plus sampling options).

	    Returns:
	        The object returned by ``llm.create_chat_completion``.

	    Raises:
	        HTTPException: 400 when ``messages`` is empty; 500, carrying the
	            underlying error text, when inference or reading the response
	            fails.
	    """
	    # 1. Log the request (lazy %-args: formatting only happens if emitted).
	    logger.info("====== NEW REQUEST RECEIVED ======")
	    for msg in request.messages:
	        logger.info("[%s]: %s", msg.role.upper(), msg.content)
	    logger.info(
	        "Parameters -> Temp: %s, Top_P: %s, Max Tokens: %s",
	        request.temperature,
	        request.top_p,
	        request.max_tokens,
	    )

	    # There is nothing to complete without at least one message; reject it as
	    # a client error instead of sending an empty conversation to the model.
	    if not request.messages:
	        raise HTTPException(
	            status_code=400,
	            detail="messages must contain at least one message",
	        )

	    # Streaming is not implemented; make the silent downgrade visible in logs.
	    if request.stream:
	        logger.warning(
	            "stream=True was requested but streaming is not supported; "
	            "returning a single JSON response"
	        )

	    # 2. llama-cpp-python accepts plain role/content dictionaries directly.
	    formatted_messages = [
	        {"role": m.role, "content": m.content} for m in request.messages
	    ]

	    try:
	        # 3. Invoke inference via llama-cpp, one request at a time.
	        with _INFERENCE_LOCK:
	            response = llm.create_chat_completion(
	                messages=formatted_messages,
	                temperature=request.temperature,
	                top_p=request.top_p,
	                max_tokens=request.max_tokens,
	                stream=False,  # Keep false for basic JSON response handling
	            )

	        assistant_response = response["choices"][0]["message"]["content"]
	        logger.info("[ASSISTANT]: %s", assistant_response)
	        logger.info("==================================")

	        return response

	    except Exception as e:
	        # logger.exception keeps the traceback in the container logs; the
	        # client still receives the same 500 + error text as before.
	        logger.exception("Inference failed: %s", e)
	        raise HTTPException(status_code=500, detail=str(e)) from e

	if __name__ == "__main__":
	    # Start the server only when this file is executed directly; uvicorn is
	    # imported here so merely importing the module does not require it.
	    import uvicorn

	    # Listen on all interfaces, port 7860.
	    listen_on = {"host": "0.0.0.0", "port": 7860}
	    uvicorn.run(app, **listen_on)