simaai
/

LFM2-VL-3B-a16w4

Image-Text-to-Text

Model card Files Files and versions

LFM2-VL-3B-a16w4 / devkit /vlm_config.json

florianvoss's picture

Upload folder using huggingface_hub

aac5f33 verified about 1 month ago

history blame contribute delete

4.01 kB

	{
	"model_name": "LFM2-VL-3B",
	"model_type": "vlm-lfm2-vl",
	"vm_cfg": {
	"model_type": "siglip2_vision_model",
	"arch": "siglip2",
	"image_size": 512,
	"patch_size": 16,
	"cls_embed": false,
	"hidden_size": 1152,
	"intermediate_size": 4304,
	"num_attention_heads": 16,
	"num_hidden_layers": 27,
	"hidden_act": "gelu_pytorch_tanh",
	"layer_norm_eps": 1e-06,
	"spatial_merge_size": 0,
	"temporal_patch_size": 0,
	"window_size": 0,
	"num_position_embeddings": 0,
	"fullatt_block_indexes": [],
	"deepstack_visual_indexes": []
	},
	"mm_cfg": {
	"num_layers": 2,
	"hidden_act": "gelu",
	"mm_tokens_per_image": 256,
	"proj_dim": 2048,
	"downsample_factor": 2
	},
	"lm_cfg": {
	"model_type": "lfm2",
	"data_type": "bfloat16",
	"arch": "lfm",
	"gen": "2",
	"size": "3b",
	"token_cfg": {
	"vocab_size": 65536
	},
	"rope_cfg": {
	"rope_theta": 1000000.0,
	"rope_local_base_freq": 1000000,
	"rope_scaling": {
	"factor": 1.0,
	"low_freq_factor": 0,
	"high_freq_factor": 0,
	"original_max_position_embeddings": 0,
	"long_factor": null,
	"short_factor": null,
	"rope_type": "default",
	"mrope_section": null,
	"mrope_interleaved": false
	}
	},
	"attn_cfg": {
	"num_attention_heads": 32,
	"num_key_value_heads": 8,
	"head_dim": 64,
	"swa_enable": false,
	"swa_ratio": 0,
	"sliding_window": 0,
	"attention_bias": false,
	"attention_dropout": 0.0,
	"query_pre_attn_scalar": 0
	},
	"mlp_cfg": {
	"intermediate_size": 10752,
	"act": "silu",
	"num_layers": 3,
	"mlp_bias": false
	},
	"hidden_size": 2048,
	"num_hidden_layers": 30,
	"max_position_embeddings": 2048,
	"rms_norm_eps": 1e-05,
	"rms_norm_unit_offset": false,
	"layer_norms": [
	"pre_attn",
	"post_attn"
	],
	"layer_types": [
	"conv",
	"conv",
	"full_attention",
	"conv",
	"conv",
	"full_attention",
	"conv",
	"conv",
	"conv",
	"full_attention",
	"conv",
	"conv",
	"conv",
	"full_attention",
	"conv",
	"conv",
	"conv",
	"full_attention",
	"conv",
	"conv",
	"conv",
	"full_attention",
	"conv",
	"conv",
	"full_attention",
	"conv",
	"conv",
	"full_attention",
	"conv",
	"conv"
	],
	"attn_logit_softcapping": null,
	"final_logit_softcapping": null,
	"lm_head_num_splits": 1,
	"lm_head_split_dim": 65536,
	"lora_cfg": null,
	"conv_L_cache": 3,
	"conv_bias": false
	},
	"pipeline_cfg": {
	"system_prompt": null,
	"chat_template": null,
	"max_num_tokens": 2048,
	"input_token_group_size": 128,
	"input_token_group_offsets": [
	0,
	128,
	256,
	384,
	512,
	640,
	768,
	896,
	1024,
	1152,
	1280,
	1408,
	1536,
	1664,
	1792,
	1920
	],
	"future_token_mask_size": 128,
	"return_logits": false,
	"use_strided_kv_cache": false,
	"enable_filter_sharing": true
	},
	"language_model_name": "LFM2-VL-3B_language",
	"vision_model_name": "LFM2-VL-3B_vision"
	}