SmolDocling: An ultra-compact vision-language model for end-to-end multi-modal document conversion
Paper
β’
2503.11576
β’
Published
β’
138
This model was converted to MLX format from ds4sd/SmolDocling-256M-preview using mlx-vlm version 0.1.18.
mlx-vlm. Find a working MLX + Docling example below.
SmolDocling is a multimodal Image-Text-to-Text model designed for efficient document conversion. It retains Docling's most popular features while ensuring full compatibility with Docling through seamless support for DoclingDocuments.
This model was presented in the paper SmolDocling: An ultra-compact vision-language model for end-to-end multi-modal document conversion.
You can use mlx to perform inference, and Docling to convert the results to a variety of output formats (md, html, etc.):
# Prerequisites:
# pip install -U mlx-vlm
# pip install docling_core
import sys
from pathlib import Path
from PIL import Image
from mlx_vlm import load, apply_chat_template, stream_generate
from mlx_vlm.utils import load_image
# FIX: these two imports were missing — DocTagsDocument / DoclingDocument are
# used below and ship with the docling_core package listed in the prerequisites.
from docling_core.types.doc import DoclingDocument
from docling_core.types.doc.document import DocTagsDocument
# Variables
path_or_hf_repo = "zboyles/SmolDocling-256M-preview-bf16"
output_path = Path("output")
output_path.mkdir(exist_ok=True)
# Model Params
eos = "<end_of_utterance>"  # sentinel token that marks the end of generation
verbose = True
kwargs = {
    "max_tokens": 8000,   # DocTags output for a dense page can be long
    "temperature": 0.0,   # deterministic decoding for document conversion
}
# Load images
# Note: I manually downloaded the image
# image_src = "https://upload.wikimedia.org/wikipedia/commons/7/76/GazettedeFrance.jpg"
# image = load_image(image_src)
image_src = "images/GazettedeFrance.jpg"
image = Image.open(image_src).convert("RGB")
# Initialize processor and model
model, processor = load(
    path_or_hf_repo=path_or_hf_repo,
    trust_remote_code=True,
)
config = model.config
# Create input messages - Docling Walkthrough Structure
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "Convert this page to docling."}
        ]
    },
]
prompt = apply_chat_template(processor, config, messages, add_generation_prompt=True)
# # Alternatively, supported prompt creation method
# messages = [{"role": "user", "content": "Convert this page to docling."}]
# prompt = apply_chat_template(processor, config, messages, add_generation_prompt=True)
# Stream tokens, accumulating the DocTags text until the EOS marker appears.
text = ""
last_response = None
for response in stream_generate(
    model=model,
    processor=processor,
    prompt=prompt,
    image=image,
    **kwargs
):
    if verbose:
        print(response.text, end="", flush=True)
    text += response.text
    last_response = response
    if eos in text:
        # Trim everything from the EOS marker onward and stop generating.
        text = text.split(eos)[0].strip()
        break
print()
if verbose:
    print("\n" + "=" * 10)
    if len(text) == 0:
        print("No text generated for this prompt")
        sys.exit(0)
    print(
        f"Prompt: {last_response.prompt_tokens} tokens, "
        f"{last_response.prompt_tps:.3f} tokens-per-sec"
    )
    print(
        f"Generation: {last_response.generation_tokens} tokens, "
        f"{last_response.generation_tps:.3f} tokens-per-sec"
    )
    print(f"Peak memory: {last_response.peak_memory:.3f} GB")
# To convert to Docling Document, MD, HTML, etc.:
# Save the raw DocTags output next to the other exports (".dt" suffix).
docling_output_path = output_path / Path(image_src).with_suffix(".dt").name
docling_output_path.write_text(text)
doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([text], [image])
doc = DoclingDocument(name="Document")
doc.load_from_doctags(doctags_doc)
# export as any format
# HTML
doc.save_as_html(docling_output_path.with_suffix(".html"))
# MD
doc.save_as_markdown(docling_output_path.with_suffix(".md"))
Thanks to @Blaizzy for the code examples that helped me quickly adapt the docling example.
Quantized
Base model
HuggingFaceTB/SmolLM2-135M