Instructions to use mlx-community/FastVLM-0.5B-bf16 with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- MLX
How to use mlx-community/FastVLM-0.5B-bf16 with MLX:
```python
# Make sure mlx-vlm is installed
# pip install --upgrade mlx-vlm

from mlx_vlm import load, generate
from mlx_vlm.prompt_utils import apply_chat_template
from mlx_vlm.utils import load_config

# Load the model
model, processor = load("mlx-community/FastVLM-0.5B-bf16")
config = load_config("mlx-community/FastVLM-0.5B-bf16")

# Prepare input
image = ["http://images.cocodataset.org/val2017/000000039769.jpg"]
prompt = "Describe this image."

# Apply chat template
formatted_prompt = apply_chat_template(
    processor, config, prompt, num_images=1
)

# Generate output
output = generate(model, processor, formatted_prompt, image)
print(output)
```
- Notebooks
- Google Colab
- Kaggle
- Local Apps
- LM Studio
Add vision_config
#3
by pcuenq HF Staff - opened
- config.json +62 -1
config.json
CHANGED
|
@@ -126,5 +126,66 @@
|
|
| 126 |
"use_cache": true,
|
| 127 |
"use_mm_proj": true,
|
| 128 |
"use_sliding_window": false,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 129 |
"vocab_size": 151936
|
| 130 |
-
}
|
|
|
|
| 126 |
"use_cache": true,
|
| 127 |
"use_mm_proj": true,
|
| 128 |
"use_sliding_window": false,
|
| 129 |
+
"vision_config": {
|
| 130 |
+
"cls_ratio": 2.0,
|
| 131 |
+
"down_patch_size": 7,
|
| 132 |
+
"down_stride": 2,
|
| 133 |
+
"downsamples": [
|
| 134 |
+
true,
|
| 135 |
+
true,
|
| 136 |
+
true,
|
| 137 |
+
true,
|
| 138 |
+
true
|
| 139 |
+
],
|
| 140 |
+
"embed_dims": [
|
| 141 |
+
96,
|
| 142 |
+
192,
|
| 143 |
+
384,
|
| 144 |
+
768,
|
| 145 |
+
1536
|
| 146 |
+
],
|
| 147 |
+
"hidden_size": 1024,
|
| 148 |
+
"image_size": 1024,
|
| 149 |
+
"intermediate_size": 3072,
|
| 150 |
+
"layer_scale_init_value": 1e-05,
|
| 151 |
+
"layers": [
|
| 152 |
+
2,
|
| 153 |
+
12,
|
| 154 |
+
24,
|
| 155 |
+
4,
|
| 156 |
+
2
|
| 157 |
+
],
|
| 158 |
+
"mlp_ratios": [
|
| 159 |
+
4,
|
| 160 |
+
4,
|
| 161 |
+
4,
|
| 162 |
+
4,
|
| 163 |
+
4
|
| 164 |
+
],
|
| 165 |
+
"num_classes": 1000,
|
| 166 |
+
"patch_size": 64,
|
| 167 |
+
"pos_embs_shapes": [
|
| 168 |
+
null,
|
| 169 |
+
null,
|
| 170 |
+
null,
|
| 171 |
+
[
|
| 172 |
+
7,
|
| 173 |
+
7
|
| 174 |
+
],
|
| 175 |
+
[
|
| 176 |
+
7,
|
| 177 |
+
7
|
| 178 |
+
]
|
| 179 |
+
],
|
| 180 |
+
"projection_dim": 768,
|
| 181 |
+
"repmixer_kernel_size": 3,
|
| 182 |
+
"token_mixers": [
|
| 183 |
+
"repmixer",
|
| 184 |
+
"repmixer",
|
| 185 |
+
"repmixer",
|
| 186 |
+
"attention",
|
| 187 |
+
"attention"
|
| 188 |
+
]
|
| 189 |
+
},
|
| 190 |
"vocab_size": 151936
|
| 191 |
+
}
|