Spaces:

CrazyQuantz
/

MiniCPM5-1B

Build error

App Files Files Community

MiniCPM5-1B / app.py

CrazyQuantz

Create app.py

a7de2f6 verified 2 days ago

raw

history blame contribute delete

8.48 kB

	import os
	import json
	import logging
	import time
	from pathlib import Path
	from typing import List, Tuple

	import gradio as gr
	from llama_cpp import Llama
	from huggingface_hub import hf_hub_download

	# ───────────────────────────────────────────────
	# CONFIG
	# ───────────────────────────────────────────────
	MODEL_REPO = "openbmb/MiniCPM5-1B-GGUF"  # Hub repo the GGUF file is fetched from
	MODEL_FILE = "MiniCPM5-1B-Q8_0.gguf"  # File name inside MODEL_REPO
	N_CTX = 8192  # Context window
	# HF Basic CPU has 8 cores (NOTE(review): confirm against the Space's hardware tier)
	N_THREADS = 8
	# MiniCPM5 uses ChatML-style templates.
	# NOTE(review): this constant is not referenced by the visible code.
	CHAT_FORMAT = "chatml"

	# Logging setup
	LOG_PATH = Path("/tmp/prompt_logs.jsonl")  # /tmp is writable on HF Spaces
	# Plain "|" separators: the previous "\|" was an invalid escape sequence that
	# put a literal backslash into every log line.
	LOG_FORMAT = "%(asctime)s | %(levelname)s | %(message)s"
	logging.basicConfig(
	    level=logging.INFO,
	    format=LOG_FORMAT,
	)
	logger = logging.getLogger("minicpm5-api")

	# ───────────────────────────────────────────────
	# MODEL LOAD
	# ───────────────────────────────────────────────
	def load_model():
	    """Fetch the GGUF weights from the Hub and load them with llama.cpp.

	    The previous ``@logger.catch`` decorator is a loguru API; the ``logger``
	    in this file is a standard-library ``logging.Logger``, which has no
	    ``catch`` attribute, so the decorator raised ``AttributeError`` as soon
	    as the module was imported. Failures are now allowed to propagate so a
	    broken download or load stops start-up instead of leaving the app
	    without a model.

	    Returns:
	        The loaded ``Llama`` instance.
	    """
	    logger.info("Downloading/verifying GGUF...")
	    model_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE)
	    logger.info("Loading %s...", MODEL_FILE)

	    model = Llama(
	        model_path=model_path,
	        n_ctx=N_CTX,
	        n_threads=N_THREADS,
	        verbose=False,
	        # No chat_format is passed, so llama-cpp-python chooses the template.
	        # NOTE(review): presumably the one embedded in the GGUF metadata - confirm.
	    )
	    logger.info("Model loaded.")
	    return model

	# Load the model once at import time; generate() uses this module-level instance.
	llm = load_model()

	# ───────────────────────────────────────────────
	# INFERENCE + LOGGING
	# ───────────────────────────────────────────────
	def log_request(
	    messages: List[dict],
	    params: dict,
	    output: str,
	    latency: float,
	):
	    """Append one JSON line describing a finished request to ``LOG_PATH``.

	    Args:
	        messages: Chat messages that were sent to the model.
	        params: Sampling parameters used for the call.
	        output: Text produced by the model.
	        latency: Elapsed seconds; stored rounded to three decimals.
	    """
	    stamp = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
	    record = dict(
	        timestamp=stamp,
	        messages=messages,
	        params=params,
	        output=output,
	        latency_sec=round(latency, 3),
	    )
	    with open(LOG_PATH, "a", encoding="utf-8") as sink:
	        # ensure_ascii=False keeps non-ASCII text readable in the log file.
	        payload = json.dumps(record, ensure_ascii=False)
	        sink.write(payload + "\n")

	def build_messages(
	    system_msg: str,
	    history: list,
	    user_msg: str,
	    enable_thinking: bool,
	) -> List[dict]:
	    """Assemble the chat-completion message list for one request.

	    The thinking mode is selected by appending " /think" or " /no_think" to
	    the final user message.
	    NOTE(review): these trigger words are taken from the original author's
	    comment about the OpenBMB repo - confirm against the model card.

	    Args:
	        system_msg: Optional system prompt; blank or None adds no system turn.
	        history: Earlier turns. Accepts role/content dicts (the format of a
	            ``type="messages"`` chatbot) as well as legacy
	            ``(user, assistant)`` pairs. None is treated as empty.
	        user_msg: The new user message; surrounding whitespace is stripped.
	        enable_thinking: True appends " /think", False appends " /no_think".

	    Returns:
	        A list of ``{"role": ..., "content": ...}`` dicts ending with the new
	        user turn.
	    """
	    messages = []
	    system_text = (system_msg or "").strip()
	    if system_text:
	        messages.append({"role": "system", "content": system_text})

	    for turn in history or []:
	        if isinstance(turn, dict):
	            # Copy only role/content so extra UI keys never reach the model,
	            # and skip entries whose content is not plain text.
	            role = turn.get("role")
	            content = turn.get("content")
	            if role in ("user", "assistant") and isinstance(content, str):
	                messages.append({"role": role, "content": content})
	            continue
	        human, assistant = turn
	        if human is not None:
	            messages.append({"role": "user", "content": human})
	        if assistant is not None:
	            messages.append({"role": "assistant", "content": assistant})

	    suffix = " /think" if enable_thinking else " /no_think"
	    messages.append({"role": "user", "content": (user_msg or "").strip() + suffix})
	    return messages

	def generate(
	    user_msg: str,
	    history: list,
	    system_msg: str,
	    enable_thinking: bool,
	    temperature: float,
	    top_p: float,
	    top_k: int,
	    repeat_penalty: float,
	    max_tokens: int,
	    seed: int,
	) -> Tuple[str, list, str]:
	    """Gradio handler: run one chat turn and log it.

	    Returns:
	        ``(textbox_value, updated_history, status)``. On success the textbox
	        value is "" so the input is cleared; on an inference error it carries
	        the error text and the history is returned unchanged.
	    """
	    history = list(history or [])
	    clean_msg = (user_msg or "").strip()
	    if not clean_msg:
	        # Nothing to send; avoid prompting the model with only a trigger word.
	        return "", history, "Please enter a message."

	    start = time.time()

	    # 1. Build messages
	    messages = build_messages(system_msg, history, clean_msg, enable_thinking)

	    # UI widgets may deliver whole numbers as floats; the sampler expects ints.
	    params = {
	        "temperature": temperature,
	        "top_p": top_p,
	        "top_k": int(top_k),
	        "repeat_penalty": repeat_penalty,
	        "max_tokens": int(max_tokens),
	        "seed": None if seed is None else int(seed),
	        "enable_thinking": enable_thinking,
	    }

	    # 2. Call llama.cpp
	    try:
	        response = llm.create_chat_completion(
	            messages=messages,
	            temperature=params["temperature"],
	            top_p=params["top_p"],
	            top_k=params["top_k"],
	            repeat_penalty=params["repeat_penalty"],
	            max_tokens=params["max_tokens"],
	            seed=params["seed"],
	            stream=False,
	        )
	        # Guard against a missing/None content field so len() below is safe.
	        assistant_text = response["choices"][0]["message"]["content"] or ""
	    except Exception as e:
	        logger.exception("Inference failed")
	        return f"Error: {e}", history, "❌ Inference error"

	    latency = time.time() - start

	    # 3. Log (best effort: a failed log write must not discard the reply)
	    try:
	        log_request(messages, params, assistant_text, latency)
	    except OSError:
	        logger.exception("Could not write prompt log")
	    logger.info("Generated %d chars in %.2fs", len(assistant_text), latency)

	    # 4. Update history in the same format it arrived in. The trigger suffix
	    # was only added to the copy sent to the model, so the user's text is
	    # stored as typed (no substring stripping).
	    uses_pairs = bool(history) and not isinstance(history[0], dict)
	    if uses_pairs:
	        history.append((clean_msg, assistant_text))
	    else:
	        history.append({"role": "user", "content": clean_msg})
	        history.append({"role": "assistant", "content": assistant_text})

	    status = f"✅ Done in {latency:.2f}s | {len(assistant_text)} chars"
	    return "", history, status

	def clear_chat():
	    """Return reset values: empty input text, empty history, and a status note."""
	    empty_input, empty_history = "", []
	    return empty_input, empty_history, "Chat cleared."

	# ───────────────────────────────────────────────
	# GRADIO UI
	# ───────────────────────────────────────────────
	with gr.Blocks(title="MiniCPM5-1B-GGUF API", theme=gr.themes.Soft()) as demo:
	    # Page header.
	    gr.Markdown("""
	# 🦙 MiniCPM5-1B-GGUF (Q8_0) — CPU Inference
	System message, thinking mode, and full sampling control with prompt logging.
	""")

	    with gr.Row():
	        # Left column: conversation, input row, and clear/status row.
	        with gr.Column(scale=2):
	            chatbot = gr.Chatbot(label="Chat", height=450, type="messages")

	            with gr.Row():
	                msg_input = gr.Textbox(
	                    placeholder="Type your message...",
	                    show_label=False,
	                    scale=4,
	                )
	                submit_btn = gr.Button(value="Send", variant="primary", scale=1)

	            with gr.Row():
	                clear_btn = gr.Button(value="Clear")
	                status_box = gr.Textbox(label="Status", interactive=False)

	        # Right column: prompt and sampling controls.
	        with gr.Column(scale=1):
	            gr.Markdown("### ⚙️ Generation Parameters")

	            system_msg = gr.Textbox(
	                label="System Message",
	                value="You are a helpful assistant.",
	                lines=2,
	            )
	            thinking_chk = gr.Checkbox(
	                label="Enable Thinking (/think)",
	                value=False,
	                info="MiniCPM5 reasoning mode",
	            )

	            temperature = gr.Slider(
	                minimum=0.0, maximum=2.0, value=0.7, step=0.05, label="Temperature"
	            )
	            top_p = gr.Slider(
	                minimum=0.0, maximum=1.0, value=0.95, step=0.01, label="Top-p"
	            )
	            top_k = gr.Slider(
	                minimum=0, maximum=200, value=40, step=1, label="Top-k"
	            )
	            repeat_penalty = gr.Slider(
	                minimum=1.0, maximum=2.0, value=1.1, step=0.05, label="Repeat Penalty"
	            )
	            max_tokens = gr.Slider(
	                minimum=16, maximum=4096, value=512, step=16, label="Max Tokens"
	            )
	            seed = gr.Number(value=42, precision=0, label="Seed (-1 for random)")

	            gr.Markdown("### 📊 Logging")
	            gr.Textbox(
	                value=str(LOG_PATH),
	                label="Log File Path",
	                interactive=False,
	            )

	    # Event wiring: the Send button and pressing Enter in the textbox both
	    # run generate() with the same inputs and outputs.
	    chat_inputs = [
	        msg_input, chatbot, system_msg, thinking_chk,
	        temperature, top_p, top_k, repeat_penalty, max_tokens, seed,
	    ]
	    chat_outputs = [msg_input, chatbot, status_box]
	    for register in (submit_btn.click, msg_input.submit):
	        register(fn=generate, inputs=list(chat_inputs), outputs=list(chat_outputs))
	    clear_btn.click(fn=clear_chat, outputs=list(chat_outputs))

	# ── Gradio API docs are auto-generated at /api/predict/ ──
	# You can also view them by clicking "Use via API" in the UI footer

	if __name__ == "__main__":
	    # Listen on all interfaces at port 7860 when run as a script.
	    launch_options = {"server_name": "0.0.0.0", "server_port": 7860}
	    demo.launch(**launch_options)