Spaces:

CrazyQuantz
/

MiniCPM5-1B

Build error

File size: 8,479 Bytes

import os
import json
import logging
import time
from pathlib import Path
from typing import List, Tuple

import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download

# ───────────────────────────────────────────────
# CONFIG
# ───────────────────────────────────────────────
def _env_int(name: str, default: int) -> int:
    """Return environment variable *name* parsed as an int.

    Falls back to *default* when the variable is unset, blank, or not a
    valid integer, so a bad override can never prevent the app from starting.
    """
    raw = os.environ.get(name, "").strip()
    try:
        return int(raw) if raw else default
    except ValueError:
        return default


# Every setting keeps its original value by default; the MINICPM_* environment
# variables allow tuning a deployment without editing this file.
MODEL_REPO = os.environ.get("MINICPM_MODEL_REPO", "openbmb/MiniCPM5-1B-GGUF")
MODEL_FILE = os.environ.get("MINICPM_MODEL_FILE", "MiniCPM5-1B-Q8_0.gguf")
N_CTX = _env_int("MINICPM_N_CTX", 8192)  # Context window, in tokens
# NOTE(review): the default of 8 assumes the host exposes 8 cores; verify
# against the actual Space hardware and lower it via MINICPM_N_THREADS if not.
N_THREADS = _env_int("MINICPM_N_THREADS", 8)
# NOTE(review): CHAT_FORMAT is declared here but is not passed to Llama() in
# load_model(), so it currently has no effect.
CHAT_FORMAT = "chatml"  # MiniCPM5 uses ChatML-style templates

# Logging setup
LOG_PATH = Path("/tmp/prompt_logs.jsonl")  # /tmp is writable on HF Spaces
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)s | %(message)s",
)
logger = logging.getLogger("minicpm5-api")

# ───────────────────────────────────────────────
# MODEL LOAD
# ───────────────────────────────────────────────
def load_model():
    """Fetch the GGUF weights from the Hub and load them with llama.cpp.

    The previous ``@logger.catch`` decorator was removed: ``logger`` is a
    standard-library ``logging.Logger``, which has no ``catch`` attribute
    (that API belongs to loguru), so the decorator raised ``AttributeError``
    while the module was being imported.

    Returns:
        The initialised ``Llama`` instance.

    Raises:
        Exception: Any download or load failure is logged with its traceback
            and re-raised, so the app fails fast instead of starting without
            a model.
    """
    try:
        logger.info("Downloading/verifying GGUF...")
        model_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE)
        logger.info("Loading %s...", MODEL_FILE)

        model = Llama(
            model_path=model_path,
            n_ctx=N_CTX,
            n_threads=N_THREADS,
            verbose=False,
            # NOTE(review): no chat_format is passed, so llama-cpp-python
            # chooses the chat template itself (presumably from the GGUF
            # metadata -- confirm). CHAT_FORMAT is not used here.
        )
    except Exception:
        logger.exception("Model load failed")
        raise

    logger.info("Model loaded.")
    return model

# Load the model once at import time; generate() uses this module-level instance.
llm = load_model()

# ───────────────────────────────────────────────
# INFERENCE + LOGGING
# ───────────────────────────────────────────────
def log_request(
    messages: List[dict],
    params: dict,
    output: str,
    latency: float,
):
    """Write one request/response record as a single JSON line to LOG_PATH.

    Args:
        messages: Chat messages that were sent to the model.
        params: Sampling parameters used for the request.
        output: Text returned by the model.
        latency: Elapsed time in seconds; stored rounded to 3 decimals.
    """
    utc_stamp = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
    record = dict(
        timestamp=utc_stamp,
        messages=messages,
        params=params,
        output=output,
        latency_sec=round(latency, 3),
    )
    # Append mode keeps earlier records; one JSON object per line (JSONL).
    with open(LOG_PATH, "a", encoding="utf-8") as log_file:
        serialized = json.dumps(record, ensure_ascii=False)
        log_file.write(serialized + "\n")

def build_messages(
    system_msg: str,
    history: List[object],
    user_msg: str,
    enable_thinking: bool,
) -> List[dict]:
    """Assemble the chat-completion message list for one request.

    The thinking mode is selected by appending a trigger to the final user
    message: ``" /think"`` when *enable_thinking* is true, ``" /no_think"``
    otherwise.
    NOTE(review): the original comment states these triggers are documented
    in the OpenBMB repo -- confirm against the model card.

    Args:
        system_msg: Optional system prompt; skipped when blank or ``None``.
        history: Earlier turns, in either of two shapes:
            * "messages" format -- dicts with ``role`` and ``content`` keys
              (what a ``gr.Chatbot(type="messages")`` passes to handlers);
              any extra keys are ignored.
            * legacy pairs -- two-item ``(user, assistant)`` lists/tuples; a
              ``None`` half is skipped.
        user_msg: The new user message; surrounding whitespace is stripped.
        enable_thinking: Chooses which trigger suffix is appended.

    Returns:
        A list of ``{"role": ..., "content": ...}`` dicts ending with the new
        user message.

    Raises:
        TypeError: If a history entry matches neither supported shape.
    """
    messages = []

    system_text = (system_msg or "").strip()
    if system_text:
        messages.append({"role": "system", "content": system_text})

    for turn in history or []:
        if isinstance(turn, dict):
            role = turn.get("role")
            content = turn.get("content")
            if role is None or content is None:
                continue
            # Forward only role/content; other keys are UI-side data.
            text = content if isinstance(content, str) else str(content)
            messages.append({"role": role, "content": text})
        elif isinstance(turn, (list, tuple)) and len(turn) == 2:
            human, assistant = turn
            if human is not None:
                messages.append({"role": "user", "content": human})
            if assistant is not None:
                messages.append({"role": "assistant", "content": assistant})
        else:
            raise TypeError(
                f"Unsupported history entry of type {type(turn).__name__}"
            )

    suffix = " /think" if enable_thinking else " /no_think"
    messages.append({"role": "user", "content": (user_msg or "").strip() + suffix})
    return messages

def _history_to_pairs(history):
    """Convert chat history into complete (user, assistant) pairs.

    Accepts "messages"-format dicts (``role``/``content``) as well as legacy
    two-item pairs. Incomplete turns -- a user message with no reply, a reply
    with no preceding user message, or a pair with a ``None`` half -- are
    dropped.
    """
    pairs = []
    pending_user = None
    for turn in history or []:
        if isinstance(turn, dict):
            content = turn.get("content")
            if content is None:
                continue
            text = content if isinstance(content, str) else str(content)
            role = turn.get("role")
            if role == "user":
                pending_user = text
            elif role == "assistant" and pending_user is not None:
                pairs.append((pending_user, text))
                pending_user = None
        elif isinstance(turn, (list, tuple)) and len(turn) == 2:
            if turn[0] is not None and turn[1] is not None:
                pairs.append((turn[0], turn[1]))
    return pairs


def generate(
    user_msg: str,
    history: List[object],
    system_msg: str,
    enable_thinking: bool,
    temperature: float,
    top_p: float,
    top_k: int,
    repeat_penalty: float,
    max_tokens: int,
    seed: int,
) -> Tuple[str, List[object], str]:
    """Gradio handler: run one chat turn and return the updated UI state.

    Args:
        user_msg: Text from the message box.
        history: Current chatbot value. The Chatbot in this file is declared
            with ``type="messages"``, so this is a list of role/content
            dicts; legacy ``(user, assistant)`` pairs are also accepted.
        system_msg: Optional system prompt.
        enable_thinking: Whether to request the model's reasoning mode.
        temperature, top_p, top_k, repeat_penalty, max_tokens, seed:
            Sampling parameters forwarded to ``llm.create_chat_completion``.

    Returns:
        ``(textbox_value, updated_history, status)``. ``textbox_value`` is
        ``""`` on success (clears the message box) and ``"Error: ..."`` when
        inference fails. On failure, or for a blank message, *history* is
        returned unchanged.
    """
    # A blank message would send only the thinking trigger to the model.
    if not (user_msg or "").strip():
        return "", history, "⚠️ Please enter a message."

    start = time.time()

    # UI widgets may deliver whole numbers as floats, and an emptied Number
    # field as None; normalise before handing them to llama.cpp.
    top_k = int(top_k)
    max_tokens = int(max_tokens)
    seed = -1 if seed is None else int(seed)

    # 1. Build messages (history normalised to pairs first)
    messages = build_messages(
        system_msg, _history_to_pairs(history), user_msg, enable_thinking
    )

    # 2. Call llama.cpp
    try:
        response = llm.create_chat_completion(
            messages=messages,
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            repeat_penalty=repeat_penalty,
            max_tokens=max_tokens,
            seed=seed,
            stream=False,
        )
        assistant_text = response["choices"][0]["message"]["content"] or ""
    except Exception as e:
        logger.exception("Inference failed")
        return f"Error: {e}", history, "❌ Inference error"

    latency = time.time() - start

    # 3. Log -- a failed log write must not discard a successful reply
    params = {
        "temperature": temperature,
        "top_p": top_p,
        "top_k": top_k,
        "repeat_penalty": repeat_penalty,
        "max_tokens": max_tokens,
        "seed": seed,
        "enable_thinking": enable_thinking,
    }
    try:
        log_request(messages, params, assistant_text, latency)
    except (OSError, TypeError, ValueError):
        logger.exception("Failed to write prompt log")
    logger.info("Generated %d chars in %.2fs", len(assistant_text), latency)

    # 4. Update history in the same shape it arrived in. The trigger suffix
    #    only ever exists inside `messages`, so user_msg is stored verbatim.
    if history and isinstance(history[0], (list, tuple)):
        updated_history = list(history) + [(user_msg, assistant_text)]
    else:
        updated_history = list(history or []) + [
            {"role": "user", "content": user_msg},
            {"role": "assistant", "content": assistant_text},
        ]
    status = f"✅ Done in {latency:.2f}s | {len(assistant_text)} chars"
    return "", updated_history, status

def clear_chat():
    """Return reset values: empty message text, empty history, status note."""
    blank_text, blank_history = "", []
    return blank_text, blank_history, "Chat cleared."

# ───────────────────────────────────────────────
# GRADIO UI
# ───────────────────────────────────────────────
with gr.Blocks(title="MiniCPM5-1B-GGUF API", theme=gr.themes.Soft()) as demo:
    # Page header
    gr.Markdown("""
    # 🦙 MiniCPM5-1B-GGUF (Q8_0) — CPU Inference
    **System message**, **thinking mode**, and **full sampling control** with prompt logging.
    """)

    with gr.Row():
        # Left column: conversation, message entry, and status line
        with gr.Column(scale=2):
            chatbot = gr.Chatbot(label="Chat", height=450, type="messages")

            with gr.Row():
                msg_input = gr.Textbox(
                    placeholder="Type your message...",
                    show_label=False,
                    scale=4,
                )
                submit_btn = gr.Button("Send", variant="primary", scale=1)

            with gr.Row():
                clear_btn = gr.Button("Clear")
                status_box = gr.Textbox(label="Status", interactive=False)

        # Right column: system prompt, thinking toggle, sampling controls
        with gr.Column(scale=1):
            gr.Markdown("### ⚙️ Generation Parameters")

            system_msg = gr.Textbox(
                label="System Message",
                value="You are a helpful assistant.",
                lines=2,
            )
            thinking_chk = gr.Checkbox(
                label="Enable Thinking (/think)",
                value=False,
                info="MiniCPM5 reasoning mode",
            )

            temperature = gr.Slider(
                minimum=0.0, maximum=2.0, value=0.7, step=0.05, label="Temperature"
            )
            top_p = gr.Slider(
                minimum=0.0, maximum=1.0, value=0.95, step=0.01, label="Top-p"
            )
            top_k = gr.Slider(
                minimum=0, maximum=200, value=40, step=1, label="Top-k"
            )
            repeat_penalty = gr.Slider(
                minimum=1.0, maximum=2.0, value=1.1, step=0.05, label="Repeat Penalty"
            )
            max_tokens = gr.Slider(
                minimum=16, maximum=4096, value=512, step=16, label="Max Tokens"
            )
            seed = gr.Number(value=42, precision=0, label="Seed (-1 for random)")

            gr.Markdown("### 📊 Logging")
            gr.Textbox(
                value=str(LOG_PATH),
                label="Log File Path",
                interactive=False,
            )

    # Event wiring: the Send button and Enter in the textbox both run
    # generate() with the same inputs (in generate()'s parameter order)
    # and the same outputs.
    generation_inputs = [
        msg_input, chatbot, system_msg, thinking_chk,
        temperature, top_p, top_k, repeat_penalty, max_tokens, seed,
    ]
    ui_outputs = [msg_input, chatbot, status_box]

    submit_btn.click(fn=generate, inputs=generation_inputs, outputs=ui_outputs)
    msg_input.submit(fn=generate, inputs=generation_inputs, outputs=ui_outputs)
    clear_btn.click(fn=clear_chat, outputs=ui_outputs)

    # Gradio generates API docs for these handlers automatically; view them
    # via the "Use via API" link in the UI footer.

# Script entry point: serve the app on all network interfaces, port 7860.
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)