import os

import gradio as gr
import torch
from PIL import Image
from transformers import AutoModel, AutoTokenizer

MODEL_ID = os.environ.get("MINICPM_MODEL_ID", "openbmb/MiniCPM-V-4_5")

# Best practice: set a deterministic seed for reproducibility
torch.manual_seed(100)

def load_model(precision_mode="int4"):
    """
    Load MiniCPM-V-4_5 on CPU with the chosen precision.
    - precision_mode: "int4" (default) for quantized weights, or "fp16" for half-precision emulation.
    Note: native FP16 inference is not supported on CPU; bfloat16 (or float32 math) is used instead.
    """
    kwargs = dict(trust_remote_code=True, attn_implementation="sdpa")
    if precision_mode == "int4":
        # 4-bit loading via bitsandbytes is not reliably available on CPU-only hosts,
        # but MiniCPM ships CPU-friendly quantization paths via trust_remote_code,
        # so we attempt load_in_4bit and fall back gracefully if it is unsupported.
        try:
            model = AutoModel.from_pretrained(
                MODEL_ID,
                load_in_4bit=True,
                device_map="cpu",
                **kwargs,
            )
            dtype_used = "int4"
        except Exception:
            # Fallback: load in bfloat16 if 4-bit isn't supported in this environment.
            # PyTorch computes bf16 ops with fp32 math on CPUs without native bf16 support.
            model = AutoModel.from_pretrained(
                MODEL_ID,
                torch_dtype=torch.bfloat16,
                device_map="cpu",
                **kwargs,
            )
            dtype_used = "fallback_bf16_or_fp32"
    else:
        # "fp16" requested: CPU cannot run native fp16, so we emulate with bfloat16.
        # Many recent Intel/AMD CPUs accelerate bfloat16; otherwise PyTorch falls back to fp32 math.
        model = AutoModel.from_pretrained(
            MODEL_ID,
            torch_dtype=torch.bfloat16,
            device_map="cpu",
            **kwargs,
        )
        dtype_used = "bf16_or_fp32_on_cpu_for_fp16_request"
    model = model.eval()
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
    return model, tokenizer, dtype_used

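# Illustrative smoke test of load_model() outside the UI (kept commented so it does not
# run on import; "demo.jpg" is a placeholder path, not a file shipped with this Space):
#
#   model, tokenizer, dtype_used = load_model("int4")
#   image = Image.open("demo.jpg").convert("RGB")
#   answer = model.chat(
#       msgs=[{"role": "user", "content": [image, "Describe this image."]}],
#       tokenizer=tokenizer,
#   )
#   print(dtype_used, answer)
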
# Global cache to avoid reloading each time
_state = {"model": None, "tokenizer": None, "mode": None, "dtype_used": None}


def ensure_model(mode):
    if _state["model"] is None or _state["mode"] != mode:
        _state["model"], _state["tokenizer"], _state["dtype_used"] = load_model(mode)
        _state["mode"] = mode

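# Caching behavior (illustrative):
#   ensure_model("int4")   # first call loads the model
#   ensure_model("int4")   # same mode -> cached model is reused
#   ensure_model("fp16")   # mode changed -> model is reloaded with the new precision
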
def chat_infer(image: Image.Image, message: str, history, mode: str, enable_thinking: bool):
    # Reject only when there is no image, no message text, and no usable history
    if (
        image is None
        and not (message and message.strip())
        and (not history or all((h[0] or "") == "" and (h[1] or "") == "" for h in history))
    ):
        return history or [], "Please upload an image or enter a message."
    ensure_model(mode)
    model, tokenizer = _state["model"], _state["tokenizer"]
    # Build msgs from history and current inputs.
    # Each item in history is a (user, assistant) pair; history carries text only, not images.
    msgs = []
    for user_msg, assistant_msg in history or []:
        if user_msg:
            msgs.append({"role": "user", "content": [user_msg]})
        if assistant_msg:
            msgs.append({"role": "assistant", "content": [assistant_msg]})
    # Add current user turn
    user_content = []
    if image is not None:
        # Ensure RGB
        if image.mode != "RGB":
            image = image.convert("RGB")
        user_content.append(image)
    if message and message.strip():
        user_content.append(message.strip())
    if not user_content:
        return history or [], "Please provide text or an image."
    msgs.append({"role": "user", "content": user_content})
    try:
        answer = model.chat(
            msgs=msgs,
            tokenizer=tokenizer,
            enable_thinking=enable_thinking,
        )
    except Exception as e:
        return history or [], f"Inference error: {e}"
    # Update history for the Gradio chat UI: append the latest (user, assistant) pair
    history = (history or []) + [(message or "[Image]", answer)]
    sys_info = f"Mode: {mode} | Loaded dtype: {_state['dtype_used']} | Device: CPU"
    return history, sys_info

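# chat_infer returns (history, sys_info); history uses the tuple format consumed by
# gr.Chatbot below, e.g. (illustrative values only):
#   [("What is in this picture?", "A cat sitting on a sofa."),
#    ("[Image]", "The chart shows quarterly revenue.")]
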
def clear_history():
    return [], ""

with gr.Blocks(title="MiniCPM-V-4_5 CPU (int4 default, fp16 optional)", fill_height=True) as demo:
    gr.Markdown("# MiniCPM-V-4_5 CPU Deployment\n- Modes: int4 (default) and fp16\n- Running on CPU")
    with gr.Row():
        with gr.Column(scale=2):
            chatbox = gr.Chatbot(height=420, label="Chat")
            with gr.Row():
                img = gr.Image(type="pil", label="Image (optional)")
                msg = gr.Textbox(placeholder="Ask a question about the image or a general query...", lines=3)
            with gr.Row():
                send_btn = gr.Button("Send", variant="primary")
                clear_btn = gr.Button("Clear")
        with gr.Column(scale=1):
            mode = gr.Radio(
                choices=["int4", "fp16"],
                value="int4",
                label="Precision Mode (CPU)",
                info="int4 is the default; fp16 runs as bf16/fp32 on CPU.",
            )
            thinking = gr.Checkbox(label="Enable Thinking Mode", value=False)
            sys_out = gr.Markdown("")

    def on_send(message, image, history, mode, thinking):
        return chat_infer(image, message, history, mode, thinking)

    send_btn.click(
        fn=on_send,
        inputs=[msg, img, chatbox, mode, thinking],
        outputs=[chatbox, sys_out],
        show_progress=True,
    )
    # Submit on Enter
    msg.submit(
        fn=on_send,
        inputs=[msg, img, chatbox, mode, thinking],
        outputs=[chatbox, sys_out],
        show_progress=True,
    )
    clear_btn.click(fn=clear_history, outputs=[chatbox, sys_out])

if __name__ == "__main__":
    # On CPU hosts with many cores, capping the thread count can reduce contention:
    torch.set_num_threads(int(os.environ.get("TORCH_NUM_THREADS", "4")))
    demo.launch()
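
# Illustrative launch commands (assuming this file is saved as app.py, the Spaces convention;
# both environment variables are optional and read above via os.environ):
#   TORCH_NUM_THREADS=8 python app.py
#   MINICPM_MODEL_ID=openbmb/MiniCPM-V-4_5 python app.py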