"""Gradio front-end for CPU inference with a MiniCPM5-1B GGUF model.

The module downloads the GGUF file from the Hugging Face Hub, loads it with
llama-cpp-python, and exposes a chat UI with a system message, a
thinking-mode toggle, sampling controls, and JSONL prompt logging.
"""

import json
import logging
import os
import time
from pathlib import Path
from typing import Any, Dict, List, Optional, Sequence, Tuple, Union

import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# ───────────────────────────────────────────────
# CONFIG
# ───────────────────────────────────────────────
MODEL_REPO = "openbmb/MiniCPM5-1B-GGUF"
MODEL_FILE = "MiniCPM5-1B-Q8_0.gguf"
N_CTX = 8192  # Context window
# Inference thread count. The original note says "HF Basic CPU has 8 cores";
# NOTE(review): confirm against the hardware the Space actually runs on.
N_THREADS = 8
# Kept for reference; it is currently NOT passed to Llama(), so
# llama-cpp-python picks the chat template on its own.
CHAT_FORMAT = "chatml"

# Logging setup
LOG_PATH = Path("/tmp/prompt_logs.jsonl")  # /tmp is writable on HF Spaces
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)s | %(message)s",
)
logger = logging.getLogger("minicpm5-api")

# One chat turn as sent to llama.cpp: {"role": ..., "content": ...}.
ChatMessage = Dict[str, Any]
# History entries may be role/content dicts (Chatbot type="messages") or
# legacy (user, assistant) pairs.
HistoryEntry = Union[ChatMessage, Tuple[str, str], List[str]]


# ───────────────────────────────────────────────
# MODEL LOAD
# ───────────────────────────────────────────────
def load_model() -> Llama:
    """Download (or reuse the cached) GGUF file and load it with llama.cpp.

    Returns:
        The loaded ``Llama`` instance.

    Raises:
        Exception: any download or load failure is logged with its traceback
            and re-raised, since the app cannot serve requests without a model.
    """
    # The stdlib ``logging.Logger`` has no ``catch`` decorator (that is a
    # loguru API), so failures are logged explicitly here instead.
    try:
        logger.info("Downloading/verifying GGUF...")
        model_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE)
        logger.info("Loading %s...", MODEL_FILE)
        model = Llama(
            model_path=model_path,
            n_ctx=N_CTX,
            n_threads=N_THREADS,
            verbose=False,
            # chat_format is intentionally not passed (see CHAT_FORMAT above).
        )
    except Exception:
        logger.exception("Model load failed")
        raise
    logger.info("Model loaded.")
    return model


llm = load_model()


# ───────────────────────────────────────────────
# INFERENCE + LOGGING
# ───────────────────────────────────────────────
def log_request(
    messages: List[dict],
    params: dict,
    output: str,
    latency: float,
):
    """Append one structured log entry to the JSONL file at ``LOG_PATH``.

    Args:
        messages: The chat messages that were sent to the model.
        params: The sampling parameters used for the request.
        output: The generated assistant text.
        latency: Wall-clock duration of the request in seconds.

    Raises:
        OSError: if the log file cannot be opened or written.
    """
    entry = {
        "timestamp": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
        "messages": messages,
        "params": params,
        "output": output,
        "latency_sec": round(latency, 3),
    }
    with open(LOG_PATH, "a", encoding="utf-8") as f:
        f.write(json.dumps(entry, ensure_ascii=False) + "\n")


def _normalize_history(
    history: Optional[Sequence[HistoryEntry]],
) -> List[ChatMessage]:
    """Convert chat history into a flat list of role/content dicts.

    Accepts both role/content dicts and legacy ``(user, assistant)`` pairs.
    Only the ``role`` and ``content`` keys of dict entries are kept.
    """
    normalized: List[ChatMessage] = []
    for entry in history or []:
        if isinstance(entry, dict):
            normalized.append(
                {"role": entry["role"], "content": entry["content"]}
            )
        else:
            human, assistant = entry
            normalized.append({"role": "user", "content": human})
            normalized.append({"role": "assistant", "content": assistant})
    return normalized


def build_messages(
    system_msg: str,
    history: Sequence[HistoryEntry],
    user_msg: str,
    enable_thinking: bool,
) -> List[dict]:
    """Assemble the message list for one chat completion request.

    The result is: an optional system message (skipped when blank), the prior
    turns from ``history``, and the new user message with a mode suffix
    appended: `` /think`` when ``enable_thinking`` is true, otherwise
    `` /no_think``.

    NOTE(review): the suffixes are the original author's way of selecting the
    model's reasoning/direct mode (said to be documented in the OpenBMB
    repo); that has not been verified here.

    Args:
        system_msg: System prompt; ignored when empty or whitespace-only.
        history: Prior turns, as role/content dicts or (user, assistant) pairs.
        user_msg: The new user message.
        enable_thinking: Selects which mode suffix is appended.

    Returns:
        A list of ``{"role": ..., "content": ...}`` dicts.
    """
    messages: List[ChatMessage] = []
    system_text = (system_msg or "").strip()
    if system_text:
        messages.append({"role": "system", "content": system_text})

    messages.extend(_normalize_history(history))

    suffix = " /think" if enable_thinking else " /no_think"
    messages.append({"role": "user", "content": user_msg.strip() + suffix})
    return messages


def generate(
    user_msg: str,
    history: Sequence[HistoryEntry],
    system_msg: str,
    enable_thinking: bool,
    temperature: float,
    top_p: float,
    top_k: int,
    repeat_penalty: float,
    max_tokens: int,
    seed: int,
) -> Tuple[str, List[dict], str]:
    """Gradio handler: run one chat completion and update the chat history.

    Returns:
        A ``(textbox_value, updated_history, status)`` tuple. On success the
        textbox value is ``""`` so the input is cleared. On failure the
        user's message is returned unchanged so it can be resent, and the
        error is reported in ``status``. ``updated_history`` is always a list
        of role/content dicts, matching the ``type="messages"`` chatbot.
    """
    turns = _normalize_history(history)

    clean_msg = (user_msg or "").strip()
    if not clean_msg:
        # Nothing to send; skip the model call entirely.
        return "", turns, "⚠️ Please enter a message."

    start = time.perf_counter()

    # 1. Build messages
    messages = build_messages(system_msg, turns, clean_msg, enable_thinking)

    # Sliders/number boxes may deliver floats; llama.cpp expects ints here.
    params = {
        "temperature": temperature,
        "top_p": top_p,
        "top_k": int(top_k),
        "repeat_penalty": repeat_penalty,
        "max_tokens": int(max_tokens),
        "seed": int(seed) if seed is not None else None,
        "enable_thinking": enable_thinking,
    }

    # 2. Call llama.cpp
    try:
        response = llm.create_chat_completion(
            messages=messages,
            temperature=params["temperature"],
            top_p=params["top_p"],
            top_k=params["top_k"],
            repeat_penalty=params["repeat_penalty"],
            max_tokens=params["max_tokens"],
            seed=params["seed"],
            stream=False,
        )
        assistant_text = response["choices"][0]["message"]["content"]
    except Exception as e:  # UI boundary: report instead of crashing
        logger.exception("Inference failed")
        return user_msg, turns, f"❌ Inference error: {e}"

    latency = time.perf_counter() - start

    # 3. Log (best effort: a failed log write must not lose the reply)
    try:
        log_request(messages, params, assistant_text, latency)
    except OSError:
        logger.exception("Could not write prompt log to %s", LOG_PATH)
    logger.info("Generated %d chars in %.2fs", len(assistant_text), latency)

    # 4. Update history with the message as typed (no mode suffix)
    updated_history = turns + [
        {"role": "user", "content": clean_msg},
        {"role": "assistant", "content": assistant_text},
    ]
    status = f"✅ Done in {latency:.2f}s | {len(assistant_text)} chars"
    return "", updated_history, status


def clear_chat():
    """Reset the input box, the chat history, and the status line."""
    return "", [], "Chat cleared."


# ───────────────────────────────────────────────
# GRADIO UI
# ───────────────────────────────────────────────
with gr.Blocks(title="MiniCPM5-1B-GGUF API", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🦙 MiniCPM5-1B-GGUF (Q8_0) — CPU Inference
    **System message**, **thinking mode**, and **full sampling control** with prompt logging.
    """)

    with gr.Row():
        with gr.Column(scale=2):
            chatbot = gr.Chatbot(label="Chat", height=450, type="messages")
            with gr.Row():
                msg_input = gr.Textbox(
                    placeholder="Type your message...",
                    show_label=False,
                    scale=4,
                )
                submit_btn = gr.Button("Send", variant="primary", scale=1)
            with gr.Row():
                clear_btn = gr.Button("Clear")
                status_box = gr.Textbox(label="Status", interactive=False)

        with gr.Column(scale=1):
            gr.Markdown("### ⚙️ Generation Parameters")
            system_msg = gr.Textbox(
                label="System Message",
                value="You are a helpful assistant.",
                lines=2,
            )
            thinking_chk = gr.Checkbox(
                label="Enable Thinking (/think)",
                value=False,
                info="MiniCPM5 reasoning mode",
            )
            temperature = gr.Slider(0.0, 2.0, value=0.7, step=0.05, label="Temperature")
            top_p = gr.Slider(0.0, 1.0, value=0.95, step=0.01, label="Top-p")
            top_k = gr.Slider(0, 200, value=40, step=1, label="Top-k")
            repeat_penalty = gr.Slider(1.0, 2.0, value=1.1, step=0.05, label="Repeat Penalty")
            max_tokens = gr.Slider(16, 4096, value=512, step=16, label="Max Tokens")
            seed = gr.Number(value=42, precision=0, label="Seed (-1 for random)")

            gr.Markdown("### 📊 Logging")
            gr.Textbox(
                value=str(LOG_PATH),
                label="Log File Path",
                interactive=False,
            )

    # Event wiring: the Send button and pressing Enter trigger the same call.
    generate_inputs = [
        msg_input,
        chatbot,
        system_msg,
        thinking_chk,
        temperature,
        top_p,
        top_k,
        repeat_penalty,
        max_tokens,
        seed,
    ]
    generate_outputs = [msg_input, chatbot, status_box]

    submit_btn.click(fn=generate, inputs=generate_inputs, outputs=generate_outputs)
    msg_input.submit(fn=generate, inputs=generate_inputs, outputs=generate_outputs)
    clear_btn.click(fn=clear_chat, outputs=generate_outputs)

# ── Gradio API docs are auto-generated at /api/predict/ ──
# You can also view them by clicking "Use via API" in the UI footer

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)