---
library_name: transformers
base_model:
- openbmb/MiniCPM-V-4.6
---

This tiny model is intended for debugging. It is randomly initialized using the configuration adapted from [openbmb/MiniCPM-V-4.6](https://huggingface.co/openbmb/MiniCPM-V-4.6).

| File path | Size |
|------|------|
| model.safetensors | 7.2MB |

### Example usage:

```python
import torch
from transformers import MiniCPMV4_6ForConditionalGeneration, AutoProcessor

model_id = "tiny-random/minicpm-v-4.6"
model = MiniCPMV4_6ForConditionalGeneration.from_pretrained(
    model_id,
    trust_remote_code=True,
    attn_implementation='sdpa',
    dtype=torch.bfloat16,
)
model = model.eval().cuda()
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
messages = [
    {
        "role": "user",
        "content": [
            {"type": "video", "url": "https://huggingface.co/datasets/openbmb/DemoCase/resolve/main/football.mp4"},
            {"type": "text", "text": "Describe this video in detail. Follow the timeline and focus on on-screen text, interface changes, main actions, and scene changes."},
        ],
    },
    {
        "role": "assistant",
        "content": [{"type": "text", "text": "Dummy response for video"}],
    },
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "https://huggingface.co/datasets/openbmb/DemoCase/resolve/main/refract.png"},
            {"type": "text", "text": "What causes this phenomenon?"},
        ],
    },
]

downsample_mode = "16x"  # Use `downsample_mode="4x"` for finer detail
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
    downsample_mode=downsample_mode,
    max_num_frames=128,
    stack_frames=1,
    max_slice_nums=1,
    use_image_id=False,
).to(model.device)

generated_ids = model.generate(**inputs, downsample_mode=downsample_mode, max_new_tokens=32)[0]
output_text = processor.decode(generated_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False)
print(output_text.replace('<|video_pad|>', 'V').replace('<|image_pad|>', 'I'))
```

### Code to create this repo:
<details>
<summary>Click to expand</summary>

```python
import json
from pathlib import Path

import torch
from huggingface_hub import hf_hub_download
from transformers import (
    AutoConfig,
    AutoModel,
    AutoProcessor,
    AutoTokenizer,
    GenerationConfig,
    set_seed,
)

source_model_id = "openbmb/MiniCPM-V-4.6"
save_folder = "/tmp/tiny-random/minicpm-v-46"

processor = AutoProcessor.from_pretrained(source_model_id, trust_remote_code=True)
processor.save_pretrained(save_folder)

with open(hf_hub_download(source_model_id, filename='config.json', repo_type='model'), 'r', encoding='utf-8') as f:
    config_json: dict = json.load(f)
text_config = config_json["text_config"]
text_config.update({
    "head_dim": 32,
    "hidden_size": 8,
    "intermediate_size": 64,
    "layer_types": [
        "linear_attention",
        "linear_attention",
        "linear_attention",
        "full_attention",
    ],
    "linear_key_head_dim": 32,
    "linear_num_key_heads": 4,
    "linear_num_value_heads": 4,
    "linear_value_head_dim": 32,
    "num_attention_heads": 8,
    "num_hidden_layers": 4,
    "num_key_value_heads": 2,
})
vision_config = config_json["vision_config"]
vision_config.update({
    "hidden_size": 128,
    "intermediate_size": 128,
    "num_attention_heads": 4,
    "num_hidden_layers": 2,
})
with open(f"{save_folder}/config.json", "w", encoding='utf-8') as f:
    json.dump(config_json, f, indent=2)

config = AutoConfig.from_pretrained(
    save_folder,
    trust_remote_code=True,
)
print(config)
torch.set_default_dtype(torch.bfloat16)
model = AutoModel.from_config(config, trust_remote_code=True)
torch.set_default_dtype(torch.float32)
model.generation_config = GenerationConfig.from_pretrained(
    source_model_id, trust_remote_code=True,
)
set_seed(42)
num_params = sum(p.numel() for p in model.parameters())
with torch.no_grad():
    for name, p in sorted(model.named_parameters()):
        torch.nn.init.normal_(p, 0, 0.2)
        print(name, p.shape, p.dtype, p.device, f'{p.numel() / num_params * 100: .2f}%')
model.save_pretrained(save_folder)

for f in Path(save_folder).glob('*.py'):
    f.unlink()
```

</details>
### Printing the model:
<details>
<summary>Click to expand</summary>

```text
MiniCPMV4_6Model(
  (vision_tower): MiniCPMV4_6VisionModel(
    (embeddings): MiniCPMV4_6VisionEmbeddings(
      (patch_embedding): Conv2d(3, 128, kernel_size=(14, 14), stride=(14, 14), padding=valid)
      (position_embedding): Embedding(4900, 128)
    )
    (encoder): MiniCPMV4_6VisionEncoder(
      (layers): ModuleList(
        (0-1): 2 x MiniCPMV4_6VisionEncoderLayer(
          (layer_norm1): LayerNorm((128,), eps=1e-06, elementwise_affine=True)
          (self_attn): MiniCPMV4_6VisionAttention(
            (k_proj): Linear(in_features=128, out_features=128, bias=True)
            (v_proj): Linear(in_features=128, out_features=128, bias=True)
            (q_proj): Linear(in_features=128, out_features=128, bias=True)
            (out_proj): Linear(in_features=128, out_features=128, bias=True)
          )
          (layer_norm2): LayerNorm((128,), eps=1e-06, elementwise_affine=True)
          (mlp): MiniCPMV4_6VisionMLP(
            (activation_fn): GELUTanh()
            (fc1): Linear(in_features=128, out_features=128, bias=True)
            (fc2): Linear(in_features=128, out_features=128, bias=True)
          )
        )
      )
    )
    (post_layernorm): LayerNorm((128,), eps=1e-06, elementwise_affine=True)
    (vit_merger): MiniCPMV4_6ViTWindowAttentionMerger(
      (self_attn): MiniCPMV4_6VisionAttention(
        (k_proj): Linear(in_features=128, out_features=128, bias=True)
        (v_proj): Linear(in_features=128, out_features=128, bias=True)
        (q_proj): Linear(in_features=128, out_features=128, bias=True)
        (out_proj): Linear(in_features=128, out_features=128, bias=True)
      )
      (layer_norm1): LayerNorm((128,), eps=1e-06, elementwise_affine=True)
      (pre_norm): LayerNorm((512,), eps=1e-06, elementwise_affine=True)
      (linear_1): Linear(in_features=512, out_features=512, bias=True)
      (act): GELUTanh()
      (linear_2): Linear(in_features=512, out_features=128, bias=True)
    )
  )
  (language_model): Qwen3_5TextModel(
    (embed_tokens): Embedding(248094, 8)
    (layers): ModuleList(
      (0-2): 3 x Qwen3_5DecoderLayer(
        (linear_attn): Qwen3_5GatedDeltaNet(
          (act): SiLUActivation()
          (conv1d): Conv1d(384, 384, kernel_size=(4,), stride=(1,), padding=(3,), groups=384, bias=False)
          (norm): Qwen3_5RMSNormGated()
          (out_proj): Linear(in_features=128, out_features=8, bias=False)
          (in_proj_qkv): Linear(in_features=8, out_features=384, bias=False)
          (in_proj_z): Linear(in_features=8, out_features=128, bias=False)
          (in_proj_b): Linear(in_features=8, out_features=4, bias=False)
          (in_proj_a): Linear(in_features=8, out_features=4, bias=False)
        )
        (mlp): Qwen3_5MLP(
          (gate_proj): Linear(in_features=8, out_features=64, bias=False)
          (up_proj): Linear(in_features=8, out_features=64, bias=False)
          (down_proj): Linear(in_features=64, out_features=8, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): Qwen3_5RMSNorm((8,), eps=1e-06)
        (post_attention_layernorm): Qwen3_5RMSNorm((8,), eps=1e-06)
      )
      (3): Qwen3_5DecoderLayer(
        (self_attn): Qwen3_5Attention(
          (q_proj): Linear(in_features=8, out_features=512, bias=False)
          (k_proj): Linear(in_features=8, out_features=64, bias=False)
          (v_proj): Linear(in_features=8, out_features=64, bias=False)
          (o_proj): Linear(in_features=256, out_features=8, bias=False)
          (q_norm): Qwen3_5RMSNorm((32,), eps=1e-06)
          (k_norm): Qwen3_5RMSNorm((32,), eps=1e-06)
        )
        (mlp): Qwen3_5MLP(
          (gate_proj): Linear(in_features=8, out_features=64, bias=False)
          (up_proj): Linear(in_features=8, out_features=64, bias=False)
          (down_proj): Linear(in_features=64, out_features=8, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): Qwen3_5RMSNorm((8,), eps=1e-06)
        (post_attention_layernorm): Qwen3_5RMSNorm((8,), eps=1e-06)
      )
    )
    (norm): Qwen3_5RMSNorm((8,), eps=1e-06)
    (rotary_emb): Qwen3_5TextRotaryEmbedding()
  )
  (merger): MiniCPMV4_6Merger(
    (mlp): ModuleList(
      (0): MiniCPMV4_6DownsampleMLP(
        (pre_norm): LayerNorm((512,), eps=1e-06, elementwise_affine=True)
        (linear_1): Linear(in_features=512, out_features=512, bias=True)
        (act): GELU(approximate='none')
        (linear_2): Linear(in_features=512, out_features=8, bias=True)
      )
    )
  )
)
```

</details>
### Test environment:

- torch: 2.10.0+cu128
- transformers: 5.9.0