Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import torch | |
| from PIL import Image | |
| from transformers import AutoModel, AutoTokenizer | |
| import numpy as np | |
| import tempfile | |
| import os | |
| from decord import VideoReader, cpu | |
| from scipy.spatial import cKDTree | |
| import math | |
| import warnings | |
# Suppress every warning category for the lifetime of the process.
warnings.filterwarnings(action="ignore")

# Module-level cache slots, filled in lazily by load_model().
model = tokenizer = None
def load_model():
    """Load the MiniCPM-V-4.5 model and tokenizer on first use and cache them.

    Both objects are stored in the module-level ``model`` and ``tokenizer``
    globals so that the expensive ``from_pretrained`` calls run only once.
    Each global is checked on its own: if an earlier call managed to load
    the model but raised while loading the tokenizer, the next call loads
    the missing tokenizer instead of returning ``None`` for it.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    global model, tokenizer
    if model is None or tokenizer is None:
        print("Loading MiniCPM-V-4.5 model...")
        if model is None:
            loaded = AutoModel.from_pretrained(
                'openbmb/MiniCPM-V-4_5',
                trust_remote_code=True,
                attn_implementation='sdpa',
                torch_dtype=torch.bfloat16,
                device_map="auto"
            )
            # Publish the global only once the model is in eval mode.
            model = loaded.eval()
        if tokenizer is None:
            tokenizer = AutoTokenizer.from_pretrained(
                'openbmb/MiniCPM-V-4_5',
                trust_remote_code=True
            )
        print("Model loaded successfully!")
    return model, tokenizer
def map_to_nearest_scale(values, scale):
    """Snap each entry of ``values`` to the closest entry of ``scale``.

    A one-dimensional KD-tree is built over ``scale`` and queried with
    ``values``; the result is an array of the matching ``scale`` entries,
    one per input value.
    """
    scale_points = np.asarray(scale)
    query_points = np.asarray(values)
    # cKDTree expects 2-D input, so each scalar becomes a 1-element row.
    lookup = cKDTree(scale_points[:, np.newaxis])
    nearest = lookup.query(query_points[:, np.newaxis])[1]
    return scale_points[nearest]
def group_array(arr, size):
    """Split ``arr`` into consecutive slices of at most ``size`` items.

    The last slice is shorter when ``len(arr)`` is not a multiple of
    ``size``. Returns a list of slices in their original order.
    """
    chunks = []
    for start in range(0, len(arr), size):
        stop = start + size
        chunks.append(arr[start:stop])
    return chunks
def uniform_sample(l, n):
    """Pick ``n`` items from ``l`` at evenly spaced positions.

    The sequence is divided into ``n`` equal-width spans and the item at
    the midpoint of each span is taken, so items repeat when ``n`` is
    larger than ``len(l)``.

    Args:
        l: Indexable sequence to sample from.
        n: Number of items to return.

    Returns:
        A list of ``n`` sampled items, or an empty list when ``n`` is not
        positive or ``l`` is empty (instead of raising ZeroDivisionError /
        IndexError).
    """
    total = len(l)
    if n <= 0 or total == 0:
        return []
    gap = total / n
    # gap / 2 shifts each pick to the centre of its span.
    return [l[int(i * gap + gap / 2)] for i in range(n)]
def encode_video(video_path, choose_fps=3, max_frames=180, max_packing=3, time_scale=0.1):
    """Sample frames from a video and compute a temporal ID for each frame.

    Args:
        video_path: Path of the video file, opened with decord's VideoReader.
        choose_fps: Target sampling rate in frames per second.
        max_frames: Frame budget; while ``choose_fps * int(duration)`` stays
            within it, frames are not packed (packing factor 1).
        max_packing: Upper bound on the packing factor.
        time_scale: Step, in seconds, of the grid that frame timestamps are
            snapped to; the temporal ID is the timestamp's grid index.

    Returns:
        A ``(frames, frame_ts_id_group)`` tuple: ``frames`` is a list of RGB
        PIL images, ``frame_ts_id_group`` is the per-frame temporal IDs
        chunked into groups of the packing factor.

    Raises:
        ValueError: If the video reports no frames or a non-positive
            frame rate.
    """
    vr = VideoReader(video_path, ctx=cpu(0))
    total_frames = len(vr)
    fps = vr.get_avg_fps()
    if total_frames == 0 or fps <= 0:
        raise ValueError("Video contains no decodable frames.")
    video_duration = total_frames / fps

    if choose_fps * int(video_duration) <= max_frames:
        packing_nums = 1
        choose_frames = round(min(choose_fps, round(fps)) * min(max_frames, video_duration))
    else:
        packing_nums = math.ceil(video_duration * choose_fps / max_frames)
        if packing_nums <= max_packing:
            choose_frames = round(video_duration * choose_fps)
        else:
            choose_frames = round(max_frames * max_packing)
            packing_nums = max_packing

    # Very short clips can round down to zero frames, which would make the
    # sampler divide by zero; always keep at least one frame.
    choose_frames = max(1, choose_frames)

    frame_idx = np.array(uniform_sample(list(range(total_frames)), choose_frames))
    print(f'Video duration: {video_duration:.2f}s, frames: {len(frame_idx)}, packing: {packing_nums}')

    frames = vr.get_batch(frame_idx).asnumpy()

    # Snap every frame timestamp to the time_scale grid and turn it into an
    # integer grid index.
    frame_idx_ts = frame_idx / fps
    scale = np.arange(0, video_duration, time_scale)
    frame_ts_id = map_to_nearest_scale(frame_idx_ts, scale) / time_scale
    # Round before casting: the quotient is only approximately an integer,
    # and a plain int cast would truncate values such as 2.9999999999999996.
    frame_ts_id = np.rint(frame_ts_id).astype(np.int32)

    frames = [Image.fromarray(v.astype('uint8')).convert('RGB') for v in frames]
    frame_ts_id_group = group_array(frame_ts_id, packing_nums)
    return frames, frame_ts_id_group
def process_input(
    file_input,
    user_prompt,
    system_prompt,
    fps,
    context_size,
    temperature,
    enable_thinking
):
    """Run the model on an uploaded image or video and return its answer.

    Args:
        file_input: Uploaded file, either a path string or an object with a
            ``name`` attribute holding the path; ``None`` when nothing was
            uploaded.
        user_prompt: Question text appended after the image / video frames.
        system_prompt: Optional system message; ignored when blank.
        fps: Sampling rate forwarded to ``encode_video`` (videos only).
        context_size: Forwarded to the model as ``max_new_tokens``.
        temperature: Forwarded to the model as ``temperature``.
        enable_thinking: Forwarded to the model as ``enable_thinking``.

    Returns:
        The model's answer, or a human-readable message when no file was
        supplied or processing failed.
    """
    # Validate before touching the model so that a click without a file
    # does not trigger the expensive model load.
    if file_input is None:
        return "Please upload an image or video file."

    try:
        chat_model, chat_tokenizer = load_model()

        # The file extension decides between the video and image paths.
        file_path = file_input.name if hasattr(file_input, 'name') else file_input
        file_ext = os.path.splitext(file_path)[1].lower()
        is_video = file_ext in ('.mp4', '.avi', '.mov', '.mkv', '.webm', '.m4v')

        msgs = []
        if system_prompt and system_prompt.strip():
            msgs.append({'role': 'system', 'content': system_prompt.strip()})

        # Generation settings shared by both input kinds.
        chat_kwargs = {
            'tokenizer': chat_tokenizer,
            'enable_thinking': enable_thinking,
            'max_new_tokens': context_size,
            'temperature': temperature,
        }

        if is_video:
            frames, frame_ts_id_group = encode_video(file_path, choose_fps=fps)
            msgs.append({'role': 'user', 'content': frames + [user_prompt]})
            # Video-only options: one slice per frame plus temporal IDs.
            chat_kwargs.update(
                use_image_id=False,
                max_slice_nums=1,
                temporal_ids=frame_ts_id_group,
            )
        else:
            # convert() yields an in-memory image, so the file handle can be
            # released as soon as the conversion is done.
            with Image.open(file_path) as opened:
                image = opened.convert('RGB')
            msgs.append({'role': 'user', 'content': [image, user_prompt]})

        return chat_model.chat(msgs=msgs, **chat_kwargs)
    except Exception as e:
        # UI boundary: report the failure as text instead of raising.
        return f"Error processing input: {str(e)}"
def create_interface():
    """Build the Gradio Blocks UI for the multimodal chat demo.

    The page has a settings column (file upload, video FPS, max output
    tokens, temperature, thinking toggle), a prompt column (system prompt,
    question, submit button, response box), clickable example prompts and a
    collapsible information panel. Both the submit button and pressing
    Enter in the question box call ``process_input``.

    Returns:
        The configured ``gr.Blocks`` instance (not yet launched).
    """
    # Emoji and the arrow are written as escapes so the source stays ASCII
    # and cannot be garbled again by a wrong file encoding.
    rocket = "\U0001F680"
    header_md = (
        "\n"
        f"# {rocket} MiniCPM-V-4.5 Multimodal Chat\n"
        "A powerful 8B parameter multimodal model that can understand images and videos with GPT-4V level performance.\n"
        "**Features:**\n"
        "- \U0001F4F8 Single/Multi-image understanding\n"
        "- \U0001F3A5 High refresh rate video understanding (up to 10 FPS)\n"
        "- \U0001F4C4 Strong OCR and document parsing\n"
        "- \U0001F9E0 Controllable fast/deep thinking mode\n"
        "- \U0001F30D Multilingual support (30+ languages)\n"
    )
    info_md = (
        "\n"
        "### MiniCPM-V-4.5 Specifications\n"
        "- **Parameters**: 8B (Qwen3-8B + SigLIP2-400M)\n"
        "- **Video Compression**: 96x compression rate (6 frames \u2192 64 tokens)\n"
        "- **Max Resolution**: Up to 1.8M pixels (1344x1344)\n"
        "- **Languages**: 30+ languages supported\n"
        "- **Performance**: Surpasses GPT-4o-latest on multiple benchmarks\n"
        "### Usage Tips\n"
        "1. **For Images**: Upload any image format and ask questions about content, objects, text, or analysis\n"
        "2. **For Videos**: Adjust FPS based on video content (higher FPS for action, lower for static scenes)\n"
        '3. **System Prompt**: Use to set specific roles like "You are an expert art critic" or "Analyze this from a medical perspective"\n'
        "4. **Deep Thinking**: Enable for complex reasoning tasks, analysis, or problem-solving\n"
        "5. **Temperature**: Lower (0.1-0.3) for factual responses, higher (0.7-1.0) for creative outputs\n"
        "### Supported Formats\n"
        "- **Images**: JPG, PNG, JPEG, BMP, GIF, WEBP\n"
        "- **Videos**: MP4, AVI, MOV, MKV, WEBM, M4V\n"
    )

    with gr.Blocks(title="MiniCPM-V-4.5 Multimodal Chat", theme=gr.themes.Soft()) as iface:
        gr.Markdown(header_md)

        with gr.Row():
            with gr.Column(scale=1):
                # File input
                file_input = gr.File(
                    label="Upload Image or Video",
                    file_types=["image", "video"],
                    type="filepath"
                )
                # Video FPS setting
                fps_slider = gr.Slider(
                    minimum=1,
                    maximum=30,
                    value=5,
                    step=1,
                    label="Video FPS",
                    info="Frames per second for video processing (only applies to videos)"
                )
                # Context size
                context_size = gr.Slider(
                    minimum=512,
                    maximum=4096,
                    value=2048,
                    step=256,
                    label="Max Output Tokens",
                    info="Maximum number of tokens to generate"
                )
                # Temperature
                temperature = gr.Slider(
                    minimum=0.1,
                    maximum=2.0,
                    value=0.7,
                    step=0.1,
                    label="Temperature",
                    info="Controls randomness in generation"
                )
                # Thinking mode
                enable_thinking = gr.Checkbox(
                    label="Enable Deep Thinking",
                    value=False,
                    info="Enable deep thinking mode for complex problem solving"
                )

            with gr.Column(scale=2):
                # System prompt
                system_prompt = gr.Textbox(
                    label="System Prompt (Optional)",
                    placeholder="Enter system instructions here...",
                    lines=3,
                    info="Set the behavior and context for the model"
                )
                # User prompt
                user_prompt = gr.Textbox(
                    label="Your Question",
                    placeholder="Describe what you see in the image/video, or ask a specific question...",
                    lines=4
                )
                # Submit button
                submit_btn = gr.Button(f"{rocket} Generate Response", variant="primary", size="lg")
                # Output
                output = gr.Textbox(
                    label="Model Response",
                    lines=15,
                    max_lines=25,
                    show_copy_button=True
                )

        # Examples
        gr.Markdown("## \U0001F4A1 Example Prompts")
        gr.Examples(
            examples=[
                ["What objects do you see in this image?"],
                ["Describe the scene in detail."],
                ["What is the main action happening in this video?"],
                ["Read and transcribe any text visible in the image."],
                ["What emotions or mood does this image convey?"],
                ["Analyze the composition and visual elements."],
                ["What might happen next in this sequence?"]
            ],
            inputs=[user_prompt],
            label="Click any example to use it"
        )

        # Event handlers: the button click and the Enter key in the question
        # box share one input list so the two bindings cannot drift apart.
        handler_inputs = [
            file_input,
            user_prompt,
            system_prompt,
            fps_slider,
            context_size,
            temperature,
            enable_thinking
        ]
        for bind in (submit_btn.click, user_prompt.submit):
            bind(
                fn=process_input,
                inputs=handler_inputs,
                outputs=output,
                show_progress=True
            )

        # Information section
        with gr.Accordion("\U0001F4CB Model Information", open=False):
            gr.Markdown(info_md)

    return iface
if __name__ == "__main__":
    # Serving options, gathered in one place for readability.
    launch_options = {
        "share": True,
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "show_error": True,
    }

    # Build the UI, enable the request queue (max_size=20), then serve it.
    demo = create_interface()
    demo.queue(max_size=20)
    demo.launch(**launch_options)