Instructions to use beshkenadze/moondream3-preview-mlx-4bit with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- MLX
How to use beshkenadze/moondream3-preview-mlx-4bit with MLX:
```python
# Make sure mlx-vlm is installed
# pip install --upgrade mlx-vlm

from mlx_vlm import load, generate
from mlx_vlm.prompt_utils import apply_chat_template
from mlx_vlm.utils import load_config

# Load the model
model, processor = load("beshkenadze/moondream3-preview-mlx-4bit")
config = load_config("beshkenadze/moondream3-preview-mlx-4bit")

# Prepare input
image = ["http://images.cocodataset.org/val2017/000000039769.jpg"]
prompt = "Describe this image."

# Apply chat template
formatted_prompt = apply_chat_template(
    processor, config, prompt, num_images=1
)

# Generate output
output = generate(model, processor, formatted_prompt, image)
print(output)
```
- Notebooks
- Google Colab
- Kaggle
- Local Apps Settings
- LM Studio
from dataclasses import asdict, dataclass, field
from typing import Dict, List, Optional
@dataclass
class TextMoeConfig:
    """Mixture-of-experts hyperparameters for the text model.

    Declared as a dataclass so instances can be built from keyword
    arguments (for example values loaded from a config file) and are
    compared by value. The numbers below are only the defaults.

    NOTE(review): field meanings are inferred from their names; confirm
    against the model code that consumes them.
    """

    num_experts: int = 64
    start_layer: int = 4  # presumably the first layer that uses MoE -- confirm
    experts_per_token: int = 8
    expert_inner_dim: int = 1024
@dataclass
class TextConfig:
    """Hyperparameters for the text model.

    Declared as a dataclass so it can be constructed from keyword
    arguments, e.g. ``TextConfig(**loaded_dict)``. The numbers below are
    only the defaults.

    ``moe`` may be given as a ``TextMoeConfig``, as ``None``, or as a plain
    dict of ``TextMoeConfig`` keyword arguments (the shape a nested section
    has after being read from JSON); a dict is converted on construction.

    NOTE(review): field meanings are inferred from their names; confirm
    against the model code that consumes them.
    """

    dim: int = 2048
    ff_dim: int = 8192
    n_layers: int = 24
    vocab_size: int = 51200
    max_context: int = 4096
    n_heads: int = 32
    n_kv_heads: int = 32
    prefix_attn: int = 730
    group_size: Optional[int] = None
    # default_factory gives every TextConfig its own TextMoeConfig instead of
    # one instance shared by all configs (and avoids the "mutable default"
    # ValueError dataclasses raise for unhashable default values).
    moe: Optional[TextMoeConfig] = field(default_factory=TextMoeConfig)

    def __post_init__(self):
        """Convert a nested ``moe`` mapping into a ``TextMoeConfig``."""
        if isinstance(self.moe, dict):
            self.moe = TextMoeConfig(**self.moe)
@dataclass
class VisionConfig:
    """Hyperparameters for the vision encoder and its projection.

    Declared as a dataclass so it can be constructed from keyword
    arguments, e.g. ``VisionConfig(**loaded_dict)``. The numbers below are
    only the defaults.

    NOTE(review): field meanings are inferred from their names (``enc_*``
    presumably describes the encoder, ``proj_*`` the projection); confirm
    against the model code that consumes them.
    """

    enc_dim: int = 1152
    enc_patch_size: int = 14
    enc_n_layers: int = 27
    enc_ff_dim: int = 4304
    enc_n_heads: int = 16
    proj_out_dim: int = 2048
    crop_size: int = 378
    in_channels: int = 3
    max_crops: int = 12
    overlap_margin: int = 4
    proj_inner_dim: int = 8192
@dataclass
class RegionConfig:
    """Hyperparameters for the region (coordinate / size) components.

    Declared as a dataclass so it can be constructed from keyword
    arguments, e.g. ``RegionConfig(**loaded_dict)``. The numbers below are
    only the defaults.

    NOTE(review): field meanings are inferred from their names; confirm
    against the model code that consumes them.
    """

    dim: int = 2048
    coord_feat_dim: int = 256
    coord_out_dim: int = 1024
    size_feat_dim: int = 512
    size_out_dim: int = 2048
    group_size: Optional[int] = None
@dataclass
class TokenizerConfig:
    """Special token ids and prompt templates used with the tokenizer.

    Declared as a dataclass: this is what turns the ``field(...)`` default
    of ``templates`` into a real per-instance dict and lets the class be
    constructed from keyword arguments, e.g.
    ``TokenizerConfig(**loaded_dict)``.

    ``templates`` maps a task name to a dict of named token-id lists.
    "caption" is keyed by "short" / "normal" / "long"; "query", "detect"
    and "point" each provide a "prefix" and a "suffix" list.

    NOTE(review): the meaning of the individual ids is inferred from the
    field names; confirm against the tokenizer vocabulary.
    """

    bos_id: int = 0
    eos_id: int = 0
    answer_id: int = 3
    thinking_id: int = 4
    coord_id: int = 5
    size_id: int = 6
    start_ground_points_id: int = 7
    end_ground_id: int = 9
    # Built by a factory so each instance owns a fresh, independently
    # mutable copy of the template table.
    templates: Dict[str, Optional[Dict[str, List[int]]]] = field(
        default_factory=lambda: {
            "caption": {
                "short": [1, 32708, 2, 12492, 3],
                "normal": [1, 32708, 2, 6382, 3],
                "long": [1, 32708, 2, 4059, 3],
            },
            "query": {"prefix": [1, 15381, 2], "suffix": [3]},
            "detect": {"prefix": [1, 7235, 476, 2], "suffix": [3]},
            "point": {"prefix": [1, 2581, 2], "suffix": [3]},
        }
    )
@dataclass
class MoondreamConfig:
    """Top-level model configuration grouping the four config sections.

    Declared as a dataclass so ``from_dict`` can build it with keyword
    arguments. Each section defaults to a freshly constructed instance
    (via ``default_factory``) so that separate ``MoondreamConfig`` objects
    never share, and cannot accidentally mutate, one another's sections.
    """

    text: TextConfig = field(default_factory=TextConfig)
    vision: VisionConfig = field(default_factory=VisionConfig)
    region: RegionConfig = field(default_factory=RegionConfig)
    tokenizer: TokenizerConfig = field(default_factory=TokenizerConfig)

    @classmethod
    def from_dict(cls, config_dict: dict):
        """Build a config from a nested dict.

        Args:
            config_dict: Mapping with optional "text", "vision", "region"
                and "tokenizer" keys, each holding keyword arguments for
                the matching section class. A missing section, or one
                explicitly set to ``None``, falls back to that section's
                defaults.

        Returns:
            A new instance of ``cls``.

        Raises:
            TypeError: If a section contains a key that is not a field of
                its section class.
        """
        # `or {}` also covers a section that is present but null, which
        # `.get(key, {})` alone would pass through as None and crash on `**`.
        return cls(
            text=TextConfig(**(config_dict.get("text") or {})),
            vision=VisionConfig(**(config_dict.get("vision") or {})),
            region=RegionConfig(**(config_dict.get("region") or {})),
            tokenizer=TokenizerConfig(**(config_dict.get("tokenizer") or {})),
        )

    def to_dict(self):
        """Return the configuration as a nested dict of plain values.

        The result has the keys "text", "vision", "region" and "tokenizer".
        ``asdict`` recursively converts nested dataclasses (such as
        ``text.moe``) and copies containers, so the returned structure is
        independent of this object: editing it does not modify the config.
        """
        return {
            "text": asdict(self.text),
            "vision": asdict(self.vision),
            "region": asdict(self.region),
            "tokenizer": asdict(self.tokenizer),
        }