MiniCPM5-1B / app.py
CrazyQuantz's picture
Create app.py
a7de2f6 verified
import os
import json
import logging
import time
from pathlib import Path
from typing import List, Tuple
import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
# ───────────────────────────────────────────────
# CONFIG
# ───────────────────────────────────────────────
# Where the quantized weights are fetched from on the Hugging Face Hub.
MODEL_REPO = "openbmb/MiniCPM5-1B-GGUF"
MODEL_FILE = "MiniCPM5-1B-Q8_0.gguf"

# llama.cpp runtime settings.
N_CTX = 8192  # context window size passed to Llama(n_ctx=...)
N_THREADS = 8  # NOTE(review): assumes 8 CPU cores are available — confirm against the Space hardware tier
CHAT_FORMAT = "chatml"  # NOTE(review): presumably the model's template family; not referenced elsewhere in this file

# Prompt log destination (JSONL) and console logging.
# NOTE(review): /tmp was presumably chosen because it is writable on the host — confirm.
LOG_PATH = Path("/tmp") / "prompt_logs.jsonl"
logging.basicConfig(
    format="%(asctime)s | %(levelname)s | %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger("minicpm5-api")
# ───────────────────────────────────────────────
# MODEL LOAD
# ───────────────────────────────────────────────
def load_model():
    """Fetch the GGUF file from the Hub and load it with llama.cpp.

    Returns:
        The initialized ``Llama`` instance.

    Raises:
        Exception: any download or load failure is logged with its traceback
            and re-raised, so the app fails loudly instead of continuing
            without a model.
    """
    # The original `@logger.catch` decorator is a loguru API; the stdlib
    # `logging.Logger` used by this file has no `catch` attribute, so the
    # decorator raised AttributeError at import time. Explicit logging below
    # keeps the "log the failure" intent.
    try:
        logger.info("Downloading/verifying GGUF...")
        model_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE)
        logger.info(f"Loading {MODEL_FILE}...")
        model = Llama(
            model_path=model_path,
            n_ctx=N_CTX,
            n_threads=N_THREADS,
            verbose=False,
            # chat_format is intentionally not passed here; the thinking-mode
            # switch is injected into the prompt by build_messages instead.
        )
    except Exception:
        logger.exception("Model load failed")
        raise
    logger.info("Model loaded.")
    return model
# Load the model once at import time; `generate` reads this module-level
# instance on every request.
llm = load_model()
# ───────────────────────────────────────────────
# INFERENCE + LOGGING
# ───────────────────────────────────────────────
def log_request(
    messages: List[dict],
    params: dict,
    output: str,
    latency: float,
):
    """Append one request/response record to the JSONL prompt log.

    Args:
        messages: The chat messages that were sent to the model.
        params: The sampling parameters used for the call.
        output: The text the model produced.
        latency: Wall-clock duration in seconds (stored rounded to 3 places).
    """
    utc_stamp = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
    record = dict(
        timestamp=utc_stamp,
        messages=messages,
        params=params,
        output=output,
        latency_sec=round(latency, 3),
    )
    # ensure_ascii=False keeps non-ASCII prompt text readable in the log file.
    serialized = json.dumps(record, ensure_ascii=False)
    with open(LOG_PATH, "a", encoding="utf-8") as log_file:
        log_file.write(f"{serialized}\n")
def build_messages(
    system_msg: str,
    history: List,
    user_msg: str,
    enable_thinking: bool,
) -> List[dict]:
    """Assemble the chat-completion message list for one request.

    Args:
        system_msg: Optional system prompt; omitted when blank or None.
        history: Prior turns, either as ``(user, assistant)`` pairs or as
            role/content dicts (the shape a ``type="messages"`` Chatbot
            supplies). None is treated as an empty history.
        user_msg: The new user message.
        enable_thinking: Selects which mode suffix is appended to the new
            user message (" /think" when true, " /no_think" otherwise).
            NOTE(review): the suffix convention is taken from the original
            code — confirm it against the model's documentation.

    Returns:
        A list of ``{"role": ..., "content": ...}`` dicts ending with the new
        user message.
    """
    messages = []

    system_text = (system_msg or "").strip()
    if system_text:
        messages.append({"role": "system", "content": system_text})

    for turn in history or []:
        if isinstance(turn, dict):
            # Messages-format history: forward only role/content so that
            # extra UI keys are never sent to the model, and skip entries
            # whose content is not plain text.
            role = turn.get("role")
            content = turn.get("content")
            if role in ("user", "assistant") and isinstance(content, str):
                messages.append({"role": role, "content": content})
            continue
        # Pair-format history: (user, assistant); a missing side is skipped.
        human, assistant = turn
        if human is not None:
            messages.append({"role": "user", "content": human})
        if assistant is not None:
            messages.append({"role": "assistant", "content": assistant})

    mode_suffix = " /think" if enable_thinking else " /no_think"
    messages.append(
        {"role": "user", "content": (user_msg or "").strip() + mode_suffix}
    )
    return messages


def generate(
    user_msg: str,
    history: List,
    system_msg: str,
    enable_thinking: bool,
    temperature: float,
    top_p: float,
    top_k: int,
    repeat_penalty: float,
    max_tokens: int,
    seed: int,
) -> Tuple[str, List, str]:
    """Gradio handler: run one chat turn and log it.

    Args:
        user_msg: Text from the input box.
        history: Current Chatbot value (pairs or role/content dicts).
        system_msg: System prompt text.
        enable_thinking: Whether to request the model's thinking mode.
        temperature, top_p, top_k, repeat_penalty, max_tokens, seed:
            Sampling parameters forwarded to ``create_chat_completion``.

    Returns:
        ``(textbox_value, updated_history, status)``. On success the textbox
        value is empty (clearing the input); on inference failure it carries
        the error text and the history is returned unchanged.
    """
    start = time.time()
    history = list(history or [])
    user_text = (user_msg or "").strip()

    # Do not spend an inference call on an empty prompt.
    if not user_text:
        return "", history, "Please enter a message."

    # UI widgets may deliver whole numbers as floats; the sampler expects ints.
    top_k = int(top_k)
    max_tokens = int(max_tokens)
    seed = None if seed is None else int(seed)

    # 1. Build messages
    messages = build_messages(system_msg, history, user_text, enable_thinking)

    # 2. Call llama.cpp
    try:
        response = llm.create_chat_completion(
            messages=messages,
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            repeat_penalty=repeat_penalty,
            max_tokens=max_tokens,
            seed=seed,
            stream=False,
        )
        assistant_text = response["choices"][0]["message"]["content"] or ""
    except Exception as e:
        logger.exception("Inference failed")
        return f"Error: {e}", history, "❌ Inference error"
    latency = time.time() - start

    # 3. Log (best effort: a failed log write must not discard the reply)
    params = {
        "temperature": temperature,
        "top_p": top_p,
        "top_k": top_k,
        "repeat_penalty": repeat_penalty,
        "max_tokens": max_tokens,
        "seed": seed,
        "enable_thinking": enable_thinking,
    }
    try:
        log_request(messages, params, assistant_text, latency)
    except OSError:
        logger.exception("Could not write prompt log")
    logger.info("Generated %d chars in %.2fs", len(assistant_text), latency)

    # 4. Update history in the same shape it arrived in. The mode suffix was
    # only added to the copy inside build_messages, so user_text is already
    # the clean text to display. An empty history defaults to role/content
    # dicts, matching the Chatbot declared with type="messages".
    if history and not isinstance(history[0], dict):
        history = history + [(user_text, assistant_text)]
    else:
        history = history + [
            {"role": "user", "content": user_text},
            {"role": "assistant", "content": assistant_text},
        ]

    status = f"✅ Done in {latency:.2f}s | {len(assistant_text)} chars"
    return "", history, status
def clear_chat():
    """Return reset values for the input box, the chat history, and the status line."""
    cleared_input = ""
    cleared_history = []
    return cleared_input, cleared_history, "Chat cleared."
# ───────────────────────────────────────────────
# GRADIO UI
# ───────────────────────────────────────────────
# NOTE(review): the layout nesting below was reconstructed from a source whose
# indentation had been flattened — confirm it matches the intended layout.
with gr.Blocks(title="MiniCPM5-1B-GGUF API", theme=gr.themes.Soft()) as demo:
    # Header text; the emoji and dash were mis-encoded in the original strings.
    gr.Markdown(
        "\n"
        "# 🦙 MiniCPM5-1B-GGUF (Q8_0) — CPU Inference\n"
        "**System message**, **thinking mode**, and **full sampling control** with prompt logging.\n"
    )
    with gr.Row():
        # Left column: conversation, input row, and controls.
        with gr.Column(scale=2):
            chatbot = gr.Chatbot(label="Chat", height=450, type="messages")
            with gr.Row():
                msg_input = gr.Textbox(
                    placeholder="Type your message...",
                    show_label=False,
                    scale=4,
                )
                submit_btn = gr.Button("Send", variant="primary", scale=1)
            with gr.Row():
                clear_btn = gr.Button("Clear")
                status_box = gr.Textbox(label="Status", interactive=False)
        # Right column: generation parameters and log location.
        with gr.Column(scale=1):
            gr.Markdown("### ⚙️ Generation Parameters")
            system_msg = gr.Textbox(
                label="System Message",
                value="You are a helpful assistant.",
                lines=2,
            )
            thinking_chk = gr.Checkbox(
                label="Enable Thinking (/think)",
                value=False,
                info="MiniCPM5 reasoning mode",
            )
            temperature = gr.Slider(0.0, 2.0, value=0.7, step=0.05, label="Temperature")
            top_p = gr.Slider(0.0, 1.0, value=0.95, step=0.01, label="Top-p")
            top_k = gr.Slider(0, 200, value=40, step=1, label="Top-k")
            repeat_penalty = gr.Slider(1.0, 2.0, value=1.1, step=0.05, label="Repeat Penalty")
            max_tokens = gr.Slider(16, 4096, value=512, step=16, label="Max Tokens")
            seed = gr.Number(value=42, precision=0, label="Seed (-1 for random)")
            gr.Markdown("### 📊 Logging")
            gr.Textbox(
                value=str(LOG_PATH),
                label="Log File Path",
                interactive=False,
            )

    # Event wiring: the Send button and pressing Enter in the textbox run the
    # same handler with the same inputs/outputs, so the lists are shared.
    generate_inputs = [
        msg_input, chatbot, system_msg, thinking_chk,
        temperature, top_p, top_k, repeat_penalty, max_tokens, seed,
    ]
    generate_outputs = [msg_input, chatbot, status_box]
    submit_btn.click(fn=generate, inputs=generate_inputs, outputs=generate_outputs)
    msg_input.submit(fn=generate, inputs=generate_inputs, outputs=generate_outputs)
    clear_btn.click(fn=clear_chat, outputs=[msg_input, chatbot, status_box])
# ── Gradio API docs are auto-generated at /api/predict/ ──
# You can also view them by clicking "Use via API" in the UI footer
# Start the web server only when this file is run as a script; binds to all
# interfaces on port 7860.
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)