import os

import gradio as gr
import torch
from PIL import Image
from transformers import AutoModel, AutoTokenizer

MODEL_ID = os.environ.get("MINICPM_MODEL_ID", "openbmb/MiniCPM-V-4_5")

# Best practice: set a deterministic seed for reproducibility
torch.manual_seed(100)

def load_model(precision_mode="int4"):
    """
    Load MiniCPM-V-4_5 on CPU with the chosen precision.
    - precision_mode: "int4" (default) for quantized weights, or "fp16" for half-precision emulation.
    Note: native FP16 inference is not supported on CPU; bfloat16 (or float32 math) is used instead.
    """
    kwargs = dict(trust_remote_code=True, attn_implementation="sdpa")
    if precision_mode == "int4":
        # 4-bit loading via bitsandbytes is not reliably available on CPU-only hosts,
        # but MiniCPM ships CPU-friendly quantization paths via trust_remote_code,
        # so we attempt load_in_4bit and fall back gracefully if it is unsupported.
        try:
            model = AutoModel.from_pretrained(
                MODEL_ID,
                load_in_4bit=True,
                device_map="cpu",
                **kwargs,
            )
            dtype_used = "int4"
        except Exception:
            # Fallback: load in bfloat16 if 4-bit isn't supported in this environment.
            # PyTorch computes bf16 ops with fp32 math on CPUs without native bf16 support.
            model = AutoModel.from_pretrained(
                MODEL_ID,
                torch_dtype=torch.bfloat16,
                device_map="cpu",
                **kwargs,
            )
            dtype_used = "fallback_bf16_or_fp32"
    else:
        # "fp16" requested: CPU cannot run native fp16, so we emulate with bfloat16.
        # Many recent Intel/AMD CPUs accelerate bfloat16; otherwise PyTorch falls back to fp32 math.
        model = AutoModel.from_pretrained(
            MODEL_ID,
            torch_dtype=torch.bfloat16,
            device_map="cpu",
            **kwargs,
        )
        dtype_used = "bf16_or_fp32_on_cpu_for_fp16_request"
    model = model.eval()
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
    return model, tokenizer, dtype_used

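# Illustrative smoke test of load_model() outside the UI (kept commented so it does not
# run on import; "demo.jpg" is a placeholder path, not a file shipped with this Space):
#
#   model, tokenizer, dtype_used = load_model("int4")
#   image = Image.open("demo.jpg").convert("RGB")
#   answer = model.chat(
#       msgs=[{"role": "user", "content": [image, "Describe this image."]}],
#       tokenizer=tokenizer,
#   )
#   print(dtype_used, answer)
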
# Global cache to avoid reloading each time
_state = {"model": None, "tokenizer": None, "mode": None, "dtype_used": None}


def ensure_model(mode):
    if _state["model"] is None or _state["mode"] != mode:
        _state["model"], _state["tokenizer"], _state["dtype_used"] = load_model(mode)
        _state["mode"] = mode

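# Caching behavior (illustrative):
#   ensure_model("int4")   # first call loads the model
#   ensure_model("int4")   # same mode -> cached model is reused
#   ensure_model("fp16")   # mode changed -> model is reloaded with the new precision
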
def chat_infer(image: Image.Image, message: str, history, mode: str, enable_thinking: bool):
    # Reject only when there is no image, no message text, and no usable history
    if (
        image is None
        and not (message and message.strip())
        and (not history or all((h[0] or "") == "" and (h[1] or "") == "" for h in history))
    ):
        return history or [], "Please upload an image or enter a message."
    ensure_model(mode)
    model, tokenizer = _state["model"], _state["tokenizer"]
    # Build msgs from history and current inputs.
    # Each item in history is a (user, assistant) pair; history carries text only, not images.
    msgs = []
    for user_msg, assistant_msg in history or []:
        if user_msg:
            msgs.append({"role": "user", "content": [user_msg]})
        if assistant_msg:
            msgs.append({"role": "assistant", "content": [assistant_msg]})
    # Add current user turn
    user_content = []
    if image is not None:
        # Ensure RGB
        if image.mode != "RGB":
            image = image.convert("RGB")
        user_content.append(image)
    if message and message.strip():
        user_content.append(message.strip())
    if not user_content:
        return history or [], "Please provide text or an image."
    msgs.append({"role": "user", "content": user_content})
    try:
        answer = model.chat(
            msgs=msgs,
            tokenizer=tokenizer,
            enable_thinking=enable_thinking,
        )
    except Exception as e:
        return history or [], f"Inference error: {e}"
    # Update history for the Gradio chat UI: append the latest (user, assistant) pair
    history = (history or []) + [(message or "[Image]", answer)]
    sys_info = f"Mode: {mode} | Loaded dtype: {_state['dtype_used']} | Device: CPU"
    return history, sys_info

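# chat_infer returns (history, sys_info); history uses the tuple format consumed by
# gr.Chatbot below, e.g. (illustrative values only):
#   [("What is in this picture?", "A cat sitting on a sofa."),
#    ("[Image]", "The chart shows quarterly revenue.")]
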
def clear_history():
    return [], ""

with gr.Blocks(title="MiniCPM-V-4_5 CPU (int4 default, fp16 optional)", fill_height=True) as demo:
    gr.Markdown("# MiniCPM-V-4_5 CPU Deployment\n- Modes: int4 (default) and fp16\n- Running on CPU")
    with gr.Row():
        with gr.Column(scale=2):
            chatbox = gr.Chatbot(height=420, label="Chat")
            with gr.Row():
                img = gr.Image(type="pil", label="Image (optional)")
                msg = gr.Textbox(placeholder="Ask a question about the image or a general query...", lines=3)
            with gr.Row():
                send_btn = gr.Button("Send", variant="primary")
                clear_btn = gr.Button("Clear")
        with gr.Column(scale=1):
            mode = gr.Radio(
                choices=["int4", "fp16"],
                value="int4",
                label="Precision Mode (CPU)",
                info="int4 is the default; fp16 runs as bf16/fp32 on CPU.",
            )
            thinking = gr.Checkbox(label="Enable Thinking Mode", value=False)
            sys_out = gr.Markdown("")

    def on_send(message, image, history, mode, thinking):
        return chat_infer(image, message, history, mode, thinking)

    send_btn.click(
        fn=on_send,
        inputs=[msg, img, chatbox, mode, thinking],
        outputs=[chatbox, sys_out],
        show_progress=True,
    )
    # Submit on Enter
    msg.submit(
        fn=on_send,
        inputs=[msg, img, chatbox, mode, thinking],
        outputs=[chatbox, sys_out],
        show_progress=True,
    )
    clear_btn.click(fn=clear_history, outputs=[chatbox, sys_out])

if __name__ == "__main__":
    # On CPU hosts with many cores, capping the thread count can reduce contention:
    torch.set_num_threads(int(os.environ.get("TORCH_NUM_THREADS", "4")))
    demo.launch()
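
# Illustrative launch commands (assuming this file is saved as app.py, the Spaces convention;
# both environment variables are optional and read above via os.environ):
#   TORCH_NUM_THREADS=8 python app.py
#   MINICPM_MODEL_ID=openbmb/MiniCPM-V-4_5 python app.py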