Instructions to use tiny-random/minicpm-v-4.6 with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use tiny-random/minicpm-v-4.6 with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("feature-extraction", model="tiny-random/minicpm-v-4.6")
```

```python
# Load model directly
from transformers import AutoProcessor, AutoModel

processor = AutoProcessor.from_pretrained("tiny-random/minicpm-v-4.6")
model = AutoModel.from_pretrained("tiny-random/minicpm-v-4.6")
```
- Notebooks
- Google Colab
- Kaggle
| library_name: transformers | |
| base_model: | |
| - openbmb/MiniCPM-V-4.6 | |
| This tiny model is intended for debugging. It is randomly initialized using a configuration adapted from [openbmb/MiniCPM-V-4.6](https://huggingface.co/openbmb/MiniCPM-V-4.6). | |
| | File path | Size | | |
| |------|------| | |
| | model.safetensors | 7.2MB | | |
| ### Example usage: | |
| ```python | |
| import torch | |
| from transformers import MiniCPMV4_6ForConditionalGeneration, AutoProcessor | |
| model_id = "tiny-random/minicpm-v-4.6" | |
| model = MiniCPMV4_6ForConditionalGeneration.from_pretrained( | |
| model_id, | |
| trust_remote_code=True, | |
| attn_implementation='sdpa', | |
| dtype=torch.bfloat16, | |
| ) | |
| model = model.eval().cuda() | |
| processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True) | |
| messages = [ | |
| { | |
| "role": "user", | |
| "content": [ | |
| {"type": "video", "url": "https://huggingface.co/datasets/openbmb/DemoCase/resolve/main/football.mp4"}, | |
| {"type": "text", "text": "Describe this video in detail. Follow the timeline and focus on on-screen text, interface changes, main actions, and scene changes."}, | |
| ], | |
| }, | |
| { | |
| "role": "assistant", | |
| "content": [{"type": "text", "text": "Dummy response for video"}], | |
| }, | |
| { | |
| "role": "user", | |
| "content": [ | |
| {"type": "image", "url": "https://huggingface.co/datasets/openbmb/DemoCase/resolve/main/refract.png"}, | |
| {"type": "text", "text": "What causes this phenomenon?"}, | |
| ], | |
| }, | |
| ] | |
| downsample_mode = "16x" # Use `downsample_mode="4x"` for finer detail | |
| inputs = processor.apply_chat_template( | |
| messages, tokenize=True, add_generation_prompt=True, | |
| return_dict=True, return_tensors="pt", | |
| downsample_mode=downsample_mode, | |
| max_num_frames=128, | |
| stack_frames=1, | |
| max_slice_nums=1, | |
| use_image_id=False, | |
| ).to(model.device) | |
| generated_ids = model.generate(**inputs, downsample_mode=downsample_mode, max_new_tokens=32)[0] | |
| output_text = processor.decode(generated_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False) | |
| print(output_text.replace('<|video_pad|>', 'V').replace('<|image_pad|>', 'I')) | |
| ``` | |
| ### Code to create this repo: | |
| <details> | |
| <summary>Click to expand</summary> | |
| ```python | |
| import json | |
| from pathlib import Path | |
| import torch | |
| from huggingface_hub import hf_hub_download | |
| from transformers import ( | |
| AutoConfig, | |
| AutoModel, | |
| AutoProcessor, | |
| AutoTokenizer, | |
| GenerationConfig, | |
| set_seed, | |
| ) | |
| source_model_id = "openbmb/MiniCPM-V-4.6" | |
| save_folder = "/tmp/tiny-random/minicpm-v-46" | |
| processor = AutoProcessor.from_pretrained(source_model_id, trust_remote_code=True) | |
| processor.save_pretrained(save_folder) | |
| with open(hf_hub_download(source_model_id, filename='config.json', repo_type='model'), 'r', encoding='utf-8') as f: | |
| config_json: dict = json.load(f) | |
| text_config = config_json["text_config"] | |
| text_config.update({ | |
| "head_dim": 32, | |
| "hidden_size": 8, | |
| "intermediate_size": 64, | |
| "layer_types": [ | |
| "linear_attention", | |
| "linear_attention", | |
| "linear_attention", | |
| "full_attention", | |
| ], | |
| "linear_key_head_dim": 32, | |
| "linear_num_key_heads": 4, | |
| "linear_num_value_heads": 4, | |
| "linear_value_head_dim": 32, | |
| "num_attention_heads": 8, | |
| "num_hidden_layers": 4, | |
| "num_key_value_heads": 2, | |
| }) | |
| vision_config = config_json["vision_config"] | |
| vision_config.update({ | |
| "hidden_size": 128, | |
| "intermediate_size": 128, | |
| "num_attention_heads": 4, | |
| "num_hidden_layers": 2, | |
| }) | |
| with open(f"{save_folder}/config.json", "w", encoding='utf-8') as f: | |
| json.dump(config_json, f, indent=2) | |
| config = AutoConfig.from_pretrained( | |
| save_folder, | |
| trust_remote_code=True, | |
| ) | |
| print(config) | |
| torch.set_default_dtype(torch.bfloat16) | |
| model = AutoModel.from_config(config, trust_remote_code=True) | |
| torch.set_default_dtype(torch.float32) | |
| model.generation_config = GenerationConfig.from_pretrained( | |
| source_model_id, trust_remote_code=True, | |
| ) | |
| set_seed(42) | |
| num_params = sum(p.numel() for p in model.parameters()) | |
| with torch.no_grad(): | |
| for name, p in sorted(model.named_parameters()): | |
| torch.nn.init.normal_(p, 0, 0.2) | |
| print(name, p.shape, p.dtype, p.device, f'{p.numel() / num_params * 100: .2f}%') | |
| model.save_pretrained(save_folder) | |
| for f in Path(save_folder).glob('*.py'): | |
| f.unlink() | |
| ``` | |
| </details> | |
| ### Printing the model: | |
| <details><summary>Click to expand</summary> | |
| ```text | |
| MiniCPMV4_6Model( | |
| (vision_tower): MiniCPMV4_6VisionModel( | |
| (embeddings): MiniCPMV4_6VisionEmbeddings( | |
| (patch_embedding): Conv2d(3, 128, kernel_size=(14, 14), stride=(14, 14), padding=valid) | |
| (position_embedding): Embedding(4900, 128) | |
| ) | |
| (encoder): MiniCPMV4_6VisionEncoder( | |
| (layers): ModuleList( | |
| (0-1): 2 x MiniCPMV4_6VisionEncoderLayer( | |
| (layer_norm1): LayerNorm((128,), eps=1e-06, elementwise_affine=True) | |
| (self_attn): MiniCPMV4_6VisionAttention( | |
| (k_proj): Linear(in_features=128, out_features=128, bias=True) | |
| (v_proj): Linear(in_features=128, out_features=128, bias=True) | |
| (q_proj): Linear(in_features=128, out_features=128, bias=True) | |
| (out_proj): Linear(in_features=128, out_features=128, bias=True) | |
| ) | |
| (layer_norm2): LayerNorm((128,), eps=1e-06, elementwise_affine=True) | |
| (mlp): MiniCPMV4_6VisionMLP( | |
| (activation_fn): GELUTanh() | |
| (fc1): Linear(in_features=128, out_features=128, bias=True) | |
| (fc2): Linear(in_features=128, out_features=128, bias=True) | |
| ) | |
| ) | |
| ) | |
| ) | |
| (post_layernorm): LayerNorm((128,), eps=1e-06, elementwise_affine=True) | |
| (vit_merger): MiniCPMV4_6ViTWindowAttentionMerger( | |
| (self_attn): MiniCPMV4_6VisionAttention( | |
| (k_proj): Linear(in_features=128, out_features=128, bias=True) | |
| (v_proj): Linear(in_features=128, out_features=128, bias=True) | |
| (q_proj): Linear(in_features=128, out_features=128, bias=True) | |
| (out_proj): Linear(in_features=128, out_features=128, bias=True) | |
| ) | |
| (layer_norm1): LayerNorm((128,), eps=1e-06, elementwise_affine=True) | |
| (pre_norm): LayerNorm((512,), eps=1e-06, elementwise_affine=True) | |
| (linear_1): Linear(in_features=512, out_features=512, bias=True) | |
| (act): GELUTanh() | |
| (linear_2): Linear(in_features=512, out_features=128, bias=True) | |
| ) | |
| ) | |
| (language_model): Qwen3_5TextModel( | |
| (embed_tokens): Embedding(248094, 8) | |
| (layers): ModuleList( | |
| (0-2): 3 x Qwen3_5DecoderLayer( | |
| (linear_attn): Qwen3_5GatedDeltaNet( | |
| (act): SiLUActivation() | |
| (conv1d): Conv1d(384, 384, kernel_size=(4,), stride=(1,), padding=(3,), groups=384, bias=False) | |
| (norm): Qwen3_5RMSNormGated() | |
| (out_proj): Linear(in_features=128, out_features=8, bias=False) | |
| (in_proj_qkv): Linear(in_features=8, out_features=384, bias=False) | |
| (in_proj_z): Linear(in_features=8, out_features=128, bias=False) | |
| (in_proj_b): Linear(in_features=8, out_features=4, bias=False) | |
| (in_proj_a): Linear(in_features=8, out_features=4, bias=False) | |
| ) | |
| (mlp): Qwen3_5MLP( | |
| (gate_proj): Linear(in_features=8, out_features=64, bias=False) | |
| (up_proj): Linear(in_features=8, out_features=64, bias=False) | |
| (down_proj): Linear(in_features=64, out_features=8, bias=False) | |
| (act_fn): SiLUActivation() | |
| ) | |
| (input_layernorm): Qwen3_5RMSNorm((8,), eps=1e-06) | |
| (post_attention_layernorm): Qwen3_5RMSNorm((8,), eps=1e-06) | |
| ) | |
| (3): Qwen3_5DecoderLayer( | |
| (self_attn): Qwen3_5Attention( | |
| (q_proj): Linear(in_features=8, out_features=512, bias=False) | |
| (k_proj): Linear(in_features=8, out_features=64, bias=False) | |
| (v_proj): Linear(in_features=8, out_features=64, bias=False) | |
| (o_proj): Linear(in_features=256, out_features=8, bias=False) | |
| (q_norm): Qwen3_5RMSNorm((32,), eps=1e-06) | |
| (k_norm): Qwen3_5RMSNorm((32,), eps=1e-06) | |
| ) | |
| (mlp): Qwen3_5MLP( | |
| (gate_proj): Linear(in_features=8, out_features=64, bias=False) | |
| (up_proj): Linear(in_features=8, out_features=64, bias=False) | |
| (down_proj): Linear(in_features=64, out_features=8, bias=False) | |
| (act_fn): SiLUActivation() | |
| ) | |
| (input_layernorm): Qwen3_5RMSNorm((8,), eps=1e-06) | |
| (post_attention_layernorm): Qwen3_5RMSNorm((8,), eps=1e-06) | |
| ) | |
| ) | |
| (norm): Qwen3_5RMSNorm((8,), eps=1e-06) | |
| (rotary_emb): Qwen3_5TextRotaryEmbedding() | |
| ) | |
| (merger): MiniCPMV4_6Merger( | |
| (mlp): ModuleList( | |
| (0): MiniCPMV4_6DownsampleMLP( | |
| (pre_norm): LayerNorm((512,), eps=1e-06, elementwise_affine=True) | |
| (linear_1): Linear(in_features=512, out_features=512, bias=True) | |
| (act): GELU(approximate='none') | |
| (linear_2): Linear(in_features=512, out_features=8, bias=True) | |
| ) | |
| ) | |
| ) | |
| ) | |
| ``` | |
| </details> | |
| ### Test environment: | |
| - torch: 2.10.0+cu128 | |
| - transformers: 5.9.0 |