File size: 3,094 Bytes
ba10c2b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import logging
import os
import threading
from typing import List, Optional, Union, Dict, Any

from fastapi import FastAPI, HTTPException
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
from pydantic import BaseModel, Field

# Send log records to the console so they show up in the Hugging Face
# container logs.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger("MiniCPM-API")

app = FastAPI(title="MiniCPM5-1B GGUF API")

# --- Model Configuration & Download ---
# GGUF repository and file to fetch for MiniCPM5-1B.
# Mind the capitalisation in the filename (M, C, P, M, B, and Q).
REPO_ID = "openbmb/MiniCPM5-1B-GGUF"
FILENAME = "MiniCPM5-1B-Q8_0.gguf"

# The download runs at import time, so the model file is available before the
# app starts handling requests.
logger.info(f"Downloading model {FILENAME} from {REPO_ID}...")
model_path = hf_hub_download(filename=FILENAME, repo_id=REPO_ID)
logger.info("Model downloaded successfully!")

# Load the model with a modest context window (2048 tokens). Per the original
# author's note, MiniCPM5 natively supports up to 131k, but a small window
# avoids OOM crashes on a Basic CPU instance.
llm = Llama(
    model_path=model_path,
    embedding=False,
    n_ctx=2048,
)

# --- OpenAI Structure Pydantic Models ---
class ChatMessage(BaseModel):
    """One message of a chat conversation, as sent in the request body."""
    # Speaker role; forwarded to llama-cpp unchanged and upper-cased only for logging.
    role: str
    # Text of the message.
    content: str

class ChatCompletionRequest(BaseModel):
    """Request body accepted by the POST /v1/chat/completions endpoint."""
    # Conversation so far, in order; required.
    messages: List[ChatMessage]
    # Sampling temperature forwarded to llama-cpp; defaults to 0.7 when omitted.
    temperature: Optional[float] = 0.7
    # Nucleus-sampling threshold forwarded to llama-cpp; defaults to 0.9 when omitted.
    top_p: Optional[float] = 0.9
    # Upper bound on generated tokens forwarded to llama-cpp; defaults to 512 when omitted.
    max_tokens: Optional[int] = 512
    # Accepted for client compatibility; the endpoint always calls llama-cpp
    # with stream=False and returns a single JSON response.
    stream: Optional[bool] = False

@app.get("/")
def home():
    """Health-check endpoint: report a static status and the served model name."""
    health_payload = {"status": "healthy", "model": "MiniCPM5-1B-Q8_0"}
    return health_payload

# FastAPI executes plain ``def`` endpoints in a worker thread pool, so several
# requests can reach the single shared ``llm`` instance at once. Inference is
# serialized on this lock to keep concurrent calls from interleaving on the
# same model context.
_llm_lock = threading.Lock()


@app.post("/v1/chat/completions")
def chat_completions(request: ChatCompletionRequest):
    """Run one non-streaming chat completion and return the llama-cpp result.

    The request and the generated reply are written to the log. Sampling
    parameters sent as explicit ``null`` fall back to the API defaults
    (temperature 0.7, top_p 0.9) instead of being forwarded as ``None``.

    Args:
        request: Parsed request body with the conversation and sampling
            parameters. ``stream`` is accepted but not honoured; a warning is
            logged and a single JSON response is returned.

    Returns:
        The response object produced by ``llm.create_chat_completion``,
        unmodified.

    Raises:
        HTTPException: 400 if ``messages`` is empty; 500 if inference or
            response handling fails (``detail`` carries the error text).
    """
    # Reject an empty conversation up front rather than prompting the model
    # with nothing.
    if not request.messages:
        raise HTTPException(
            status_code=400,
            detail="'messages' must contain at least one message",
        )

    # An explicit JSON null reaches us as None; substitute the API defaults so
    # llama-cpp always receives numeric sampling parameters.
    temperature = 0.7 if request.temperature is None else request.temperature
    top_p = 0.9 if request.top_p is None else request.top_p

    # 1. Log the incoming request.
    logger.info("====== NEW REQUEST RECEIVED ======")
    for msg in request.messages:
        logger.info("[%s]: %s", msg.role.upper(), msg.content)
    logger.info(
        "Parameters -> Temp: %s, Top_P: %s, Max Tokens: %s",
        temperature,
        top_p,
        request.max_tokens,
    )
    if request.stream:
        # Make the ignored flag visible instead of dropping it silently.
        logger.warning(
            "stream=True was requested but streaming is not supported; "
            "returning a single JSON response"
        )

    # 2. llama-cpp-python consumes plain role/content dictionaries directly.
    formatted_messages = [
        {"role": m.role, "content": m.content} for m in request.messages
    ]

    try:
        # 3. Invoke inference, one request at a time.
        with _llm_lock:
            response = llm.create_chat_completion(
                messages=formatted_messages,
                temperature=temperature,
                top_p=top_p,
                max_tokens=request.max_tokens,
                stream=False,  # single JSON response, no SSE
            )

        assistant_response = response["choices"][0]["message"]["content"]
        logger.info("[ASSISTANT]: %s", assistant_response)
        logger.info("==================================")

        return response

    except Exception as e:
        # Top-level boundary: log with traceback, then surface as HTTP 500.
        logger.exception("Inference failed: %s", e)
        raise HTTPException(status_code=500, detail=str(e)) from e

if __name__ == "__main__":
    # Direct execution: serve the app with uvicorn on all interfaces, port 7860.
    import uvicorn

    uvicorn.run(app, port=7860, host="0.0.0.0")