{ "bomFormat": "CycloneDX", "specVersion": "1.6", "serialNumber": "urn:uuid:2f96c755-4887-4315-83f9-22de2e18b301", "version": 1, "metadata": { "timestamp": "2025-06-05T09:37:43.782903+00:00", "component": { "type": "machine-learning-model", "bom-ref": "HuggingFaceTB/SmolVLM2-2.2B-Instruct-7235ccb9-479b-53f2-94fa-6512d4edebb3", "name": "HuggingFaceTB/SmolVLM2-2.2B-Instruct", "externalReferences": [ { "url": "https://huggingface.co/HuggingFaceTB/SmolVLM2-2.2B-Instruct", "type": "documentation" } ], "modelCard": { "modelParameters": { "task": "image-text-to-text", "architectureFamily": "smolvlm", "modelArchitecture": "SmolVLMForConditionalGeneration", "datasets": [ { "ref": "HuggingFaceM4/the_cauldron-0b60b937-29a7-5f0c-9fa6-ec10bf894687" }, { "ref": "HuggingFaceM4/Docmatix-1623432e-1ae3-5888-a9c9-23e566c5a05a" }, { "ref": "lmms-lab/LLaVA-OneVision-Data-5dedde7e-a93e-51b5-99ad-6e589118a200" }, { "ref": "lmms-lab/M4-Instruct-Data-67437f1e-35b0-5a44-8de0-217c202cacf0" }, { "ref": "HuggingFaceFV/finevideo-b6414f8b-8137-53ce-bed4-674af789a978" }, { "ref": "MAmmoTH-VL/MAmmoTH-VL-Instruct-12M-2c6bc921-c3e2-5525-97ec-d8782f1581e5" }, { "ref": "lmms-lab/LLaVA-Video-178K-3489c331-beb5-56f7-87d9-67a7c5d6437c" }, { "ref": "orrzohar/Video-STaR-7f15539e-ebd4-570c-92c7-ffffc5190a49" }, { "ref": "Mutonix/Vript-4215585f-ed48-5399-8b8c-640ad24fcb01" }, { "ref": "TIGER-Lab/VISTA-400K-ce87b7d4-1f05-5464-89f5-82ca09d29429" }, { "ref": "Enxin/MovieChat-1K_train-fa5b461a-f000-517a-8048-3363781aa109" }, { "ref": "ShareGPT4Video/ShareGPT4Video-4ef9dfce-9cf4-5a48-a489-ac850636b73d" } ] }, "properties": [ { "name": "library_name", "value": "transformers" }, { "name": "base_model", "value": "HuggingFaceTB/SmolVLM-Instruct" } ], "consideration": { "useCases": "SmolVLM2 can be used for inference on multimodal (video / image / text) tasks where the input consists of text queries along with video or one or more images. Text and media files can be interleaved arbitrarily, enabling tasks like captioning, visual question answering, and storytelling based on visual content. The model does not support image or video generation.To fine-tune SmolVLM2 on a specific task, you can follow [the fine-tuning tutorial](https://github.com/huggingface/smollm/blob/main/vision/finetuning/Smol_VLM_FT.ipynb)." 
} }, "authors": [ { "name": "HuggingFaceTB" } ], "licenses": [ { "license": { "id": "Apache-2.0", "url": "https://spdx.org/licenses/Apache-2.0.html" } } ], "description": "- **Developed by:** Hugging Face \ud83e\udd17- **Model type:** Multi-modal model (image/multi-image/video/text)- **Language(s) (NLP):** English- **License:** Apache 2.0- **Architecture:** Based on [Idefics3](https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3) (see technical summary)", "tags": [ "transformers", "safetensors", "smolvlm", "image-text-to-text", "video-text-to-text", "conversational", "en", "dataset:HuggingFaceM4/the_cauldron", "dataset:HuggingFaceM4/Docmatix", "dataset:lmms-lab/LLaVA-OneVision-Data", "dataset:lmms-lab/M4-Instruct-Data", "dataset:HuggingFaceFV/finevideo", "dataset:MAmmoTH-VL/MAmmoTH-VL-Instruct-12M", "dataset:lmms-lab/LLaVA-Video-178K", "dataset:orrzohar/Video-STaR", "dataset:Mutonix/Vript", "dataset:TIGER-Lab/VISTA-400K", "dataset:Enxin/MovieChat-1K_train", "dataset:ShareGPT4Video/ShareGPT4Video", "arxiv:2504.05299", "base_model:HuggingFaceTB/SmolVLM-Instruct", "base_model:finetune:HuggingFaceTB/SmolVLM-Instruct", "license:apache-2.0", "endpoints_compatible", "region:us" ] } }, "components": [ { "type": "data", "bom-ref": "HuggingFaceM4/the_cauldron-0b60b937-29a7-5f0c-9fa6-ec10bf894687", "name": "HuggingFaceM4/the_cauldron", "data": [ { "type": "dataset", "bom-ref": "HuggingFaceM4/the_cauldron-0b60b937-29a7-5f0c-9fa6-ec10bf894687", "name": "HuggingFaceM4/the_cauldron", "contents": { "url": "https://huggingface.co/datasets/HuggingFaceM4/the_cauldron", "properties": [ { "name": "configs", "value": "Name of the dataset subset: ai2d {\"split\": \"train\", \"path\": \"ai2d/train-*\"}" }, { "name": "configs", "value": "Name of the dataset subset: aokvqa {\"split\": \"train\", \"path\": \"aokvqa/train-*\"}" }, { "name": "configs", "value": "Name of the dataset subset: chart2text {\"split\": \"train\", \"path\": \"chart2text/train-*\"}" }, { "name": "configs", "value": "Name of the dataset subset: chartqa {\"split\": \"train\", \"path\": \"chartqa/train-*\"}" }, { "name": "configs", "value": "Name of the dataset subset: clevr {\"split\": \"train\", \"path\": \"clevr/train-*\"}" }, { "name": "configs", "value": "Name of the dataset subset: clevr_math {\"split\": \"train\", \"path\": \"clevr_math/train-*\"}" }, { "name": "configs", "value": "Name of the dataset subset: cocoqa {\"split\": \"train\", \"path\": \"cocoqa/train-*\"}" }, { "name": "configs", "value": "Name of the dataset subset: datikz {\"split\": \"train\", \"path\": \"datikz/train-*\"}" }, { "name": "configs", "value": "Name of the dataset subset: diagram_image_to_text {\"split\": \"train\", \"path\": \"diagram_image_to_text/train-*\"}" }, { "name": "configs", "value": "Name of the dataset subset: docvqa {\"split\": \"train\", \"path\": \"docvqa/train-*\"}" }, { "name": "configs", "value": "Name of the dataset subset: dvqa {\"split\": \"train\", \"path\": \"dvqa/train-*\"}" }, { "name": "configs", "value": "Name of the dataset subset: figureqa {\"split\": \"train\", \"path\": \"figureqa/train-*\"}" }, { "name": "configs", "value": "Name of the dataset subset: finqa {\"split\": \"train\", \"path\": \"finqa/train-*\"}" }, { "name": "configs", "value": "Name of the dataset subset: geomverse {\"split\": \"train\", \"path\": \"geomverse/train-*\"}" }, { "name": "configs", "value": "Name of the dataset subset: hateful_memes {\"split\": \"train\", \"path\": \"hateful_memes/train-*\"}" }, { "name": "configs", "value": "Name of the dataset 
subset: hitab {\"split\": \"train\", \"path\": \"hitab/train-*\"}" }, { "name": "configs", "value": "Name of the dataset subset: iam {\"split\": \"train\", \"path\": \"iam/train-*\"}" }, { "name": "configs", "value": "Name of the dataset subset: iconqa {\"split\": \"train\", \"path\": \"iconqa/train-*\"}" }, { "name": "configs", "value": "Name of the dataset subset: infographic_vqa {\"split\": \"train\", \"path\": \"infographic_vqa/train-*\"}" }, { "name": "configs", "value": "Name of the dataset subset: intergps {\"split\": \"train\", \"path\": \"intergps/train-*\"}" }, { "name": "configs", "value": "Name of the dataset subset: localized_narratives {\"split\": \"train\", \"path\": \"localized_narratives/train-*\"}" }, { "name": "configs", "value": "Name of the dataset subset: mapqa {\"split\": \"train\", \"path\": \"mapqa/train-*\"}" }, { "name": "configs", "value": "Name of the dataset subset: mimic_cgd {\"split\": \"train\", \"path\": \"mimic_cgd/train-*\"}" }, { "name": "configs", "value": "Name of the dataset subset: multihiertt {\"split\": \"train\", \"path\": \"multihiertt/train-*\"}" }, { "name": "configs", "value": "Name of the dataset subset: nlvr2 {\"split\": \"train\", \"path\": \"nlvr2/train-*\"}" }, { "name": "configs", "value": "Name of the dataset subset: ocrvqa {\"split\": \"train\", \"path\": \"ocrvqa/train-*\"}" }, { "name": "configs", "value": "Name of the dataset subset: okvqa {\"split\": \"train\", \"path\": \"okvqa/train-*\"}" }, { "name": "configs", "value": "Name of the dataset subset: plotqa {\"split\": \"train\", \"path\": \"plotqa/train-*\"}" }, { "name": "configs", "value": "Name of the dataset subset: raven {\"split\": \"train\", \"path\": \"raven/train-*\"}" }, { "name": "configs", "value": "Name of the dataset subset: rendered_text {\"split\": \"train\", \"path\": \"rendered_text/train-*\"}" }, { "name": "configs", "value": "Name of the dataset subset: robut_sqa {\"split\": \"train\", \"path\": \"robut_sqa/train-*\"}" }, { "name": "configs", "value": "Name of the dataset subset: robut_wikisql {\"split\": \"train\", \"path\": \"robut_wikisql/train-*\"}" }, { "name": "configs", "value": "Name of the dataset subset: robut_wtq {\"split\": \"train\", \"path\": \"robut_wtq/train-*\"}" }, { "name": "configs", "value": "Name of the dataset subset: scienceqa {\"split\": \"train\", \"path\": \"scienceqa/train-*\"}" }, { "name": "configs", "value": "Name of the dataset subset: screen2words {\"split\": \"train\", \"path\": \"screen2words/train-*\"}" }, { "name": "configs", "value": "Name of the dataset subset: spot_the_diff {\"split\": \"train\", \"path\": \"spot_the_diff/train-*\"}" }, { "name": "configs", "value": "Name of the dataset subset: st_vqa {\"split\": \"train\", \"path\": \"st_vqa/train-*\"}" }, { "name": "configs", "value": "Name of the dataset subset: tabmwp {\"split\": \"train\", \"path\": \"tabmwp/train-*\"}" }, { "name": "configs", "value": "Name of the dataset subset: tallyqa {\"split\": \"train\", \"path\": \"tallyqa/train-*\"}" }, { "name": "configs", "value": "Name of the dataset subset: tat_qa {\"split\": \"train\", \"path\": \"tat_qa/train-*\"}" }, { "name": "configs", "value": "Name of the dataset subset: textcaps {\"split\": \"train\", \"path\": \"textcaps/train-*\"}" }, { "name": "configs", "value": "Name of the dataset subset: textvqa {\"split\": \"train\", \"path\": \"textvqa/train-*\"}" }, { "name": "configs", "value": "Name of the dataset subset: tqa {\"split\": \"train\", \"path\": \"tqa/train-*\"}" }, { "name": "configs", "value": "Name 
of the dataset subset: vistext {\"split\": \"train\", \"path\": \"vistext/train-*\"}" }, { "name": "configs", "value": "Name of the dataset subset: visual7w {\"split\": \"train\", \"path\": \"visual7w/train-*\"}" }, { "name": "configs", "value": "Name of the dataset subset: visualmrc {\"split\": \"train\", \"path\": \"visualmrc/train-*\"}" }, { "name": "configs", "value": "Name of the dataset subset: vqarad {\"split\": \"train\", \"path\": \"vqarad/train-*\"}" }, { "name": "configs", "value": "Name of the dataset subset: vqav2 {\"split\": \"train\", \"path\": \"vqav2/train-*\"}" }, { "name": "configs", "value": "Name of the dataset subset: vsr {\"split\": \"train\", \"path\": \"vsr/train-*\"}" }, { "name": "configs", "value": "Name of the dataset subset: websight {\"split\": \"train\", \"path\": \"websight/train-*\"}" } ] }, "governance": { "owners": [ { "organization": { "name": "HuggingFaceM4", "url": "https://huggingface.co/HuggingFaceM4" } } ] }, "description": "Dataset Card for The Cauldron\n\nDataset description\n\nThe Cauldron is part of the Idefics2 release.\nIt is a massive collection of 50 vision-language datasets (training sets only) that were used for the fine-tuning of the vision-language model Idefics2.\n\nLoad the dataset\n\nTo load the dataset, install the library datasets with pip install datasets. Then,\nfrom datasets import load_dataset\nds = load_dataset(\"HuggingFaceM4/the_cauldron\", \"ai2d\")\n\nto download and load the\u2026 See the full description on the dataset page: https://huggingface.co/datasets/HuggingFaceM4/the_cauldron." } ] }, { "type": "data", "bom-ref": "HuggingFaceM4/Docmatix-1623432e-1ae3-5888-a9c9-23e566c5a05a", "name": "HuggingFaceM4/Docmatix", "data": [ { "type": "dataset", "bom-ref": "HuggingFaceM4/Docmatix-1623432e-1ae3-5888-a9c9-23e566c5a05a", "name": "HuggingFaceM4/Docmatix", "contents": { "url": "https://huggingface.co/datasets/HuggingFaceM4/Docmatix", "properties": [ { "name": "task_categories", "value": "visual-question-answering" }, { "name": "language", "value": "en" }, { "name": "size_categories", "value": "1M<n<10M" } ] } } ] } ] }