<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{"title":"Image tasks with IDEFICS","local":"image-tasks-with-idefics","sections":[{"title":"Loading the model","local":"loading-the-model","sections":[{"title":"Quantized model","local":"quantized-model","sections":[],"depth":3}],"depth":2},{"title":"Image captioning","local":"image-captioning","sections":[],"depth":2},{"title":"Prompted image captioning","local":"prompted-image-captioning","sections":[],"depth":2},{"title":"Few-shot prompting","local":"few-shot-prompting","sections":[],"depth":2},{"title":"Visual question answering","local":"visual-question-answering","sections":[],"depth":2},{"title":"Image classification","local":"image-classification","sections":[],"depth":2},{"title":"Image-guided text generation","local":"image-guided-text-generation","sections":[],"depth":2},{"title":"Running inference in batch mode","local":"running-inference-in-batch-mode","sections":[],"depth":2},{"title":"IDEFICS instruct for conversational use","local":"idefics-instruct-for-conversational-use","sections":[],"depth":2}],"depth":1}">
<h1 id="image-tasks-with-idefics">Image tasks with IDEFICS</h1>

<p>While individual tasks can be tackled by fine-tuning specialized models, an alternative approach
that has recently emerged and gained popularity is to use large models for a diverse set of tasks without fine-tuning.
For instance, large language models can handle NLP tasks such as summarization, translation, classification, and more.
This approach is no longer limited to a single modality, such as text, and in this guide, we will illustrate how you can
solve image-text tasks with a large multimodal model called IDEFICS.</p>

<p><a href="../model_doc/idefics">IDEFICS</a> is an open-access vision and language model based on <a href="https://huggingface.co/papers/2204.14198" rel="nofollow">Flamingo</a>,
a state-of-the-art visual language model initially developed by DeepMind. The model accepts arbitrary sequences of image
and text inputs and generates coherent text as output. It can answer questions about images, describe visual content,
create stories grounded in multiple images, and so on. IDEFICS comes in two variants: <a href="https://huggingface.co/HuggingFaceM4/idefics-80b" rel="nofollow">80 billion parameters</a>
and <a href="https://huggingface.co/HuggingFaceM4/idefics-9b" rel="nofollow">9 billion parameters</a>, both of which are available on the 🤗 Hub. For each variant, you can also find fine-tuned instructed
versions of the model adapted for conversational use cases.</p>

<p>This model is exceptionally versatile and can be used for a wide range of image and multimodal tasks. However,
being a large model means it requires significant computational resources and infrastructure. It is up to you to decide whether
this approach suits your use case better than fine-tuning specialized models for each individual task.</p>

<p>In this guide, you’ll learn how to:</p>
<ul>
<li><a href="#loading-the-model">Load IDEFICS</a> and <a href="#quantized-model">load the quantized version of the model</a></li>
<li>Use IDEFICS for:
<ul>
<li><a href="#image-captioning">Image captioning</a></li>
<li><a href="#prompted-image-captioning">Prompted image captioning</a></li>
<li><a href="#few-shot-prompting">Few-shot prompting</a></li>
<li><a href="#visual-question-answering">Visual question answering</a></li>
<li><a href="#image-classification">Image classification</a></li>
<li><a href="#image-guided-text-generation">Image-guided text generation</a></li>
</ul>
</li>
<li><a href="#running-inference-in-batch-mode">Run inference in batch mode</a></li>
<li><a href="#idefics-instruct-for-conversational-use">Run IDEFICS instruct for conversational use</a></li>
</ul>

<p>Before you begin, make sure you have all the necessary libraries installed.</p>

<pre>pip install -q bitsandbytes sentencepiece accelerate transformers</pre>

<div class="course-tip">To run the following examples with a non-quantized version of the model checkpoint you will need at least 20GB of GPU memory.</div>

<h2 id="loading-the-model">Loading the model</h2>

<p>Let’s start by loading the model’s 9 billion parameter checkpoint:</p>

<pre>>>> checkpoint = "HuggingFaceM4/idefics-9b"</pre>

<p>Just like for other Transformers models, you need to load a processor and the model itself from the checkpoint.
The IDEFICS processor wraps a <a href="/docs/transformers/main/en/model_doc/llama#transformers.LlamaTokenizer">LlamaTokenizer</a> and an IDEFICS image processor into a single processor that takes care of
preparing text and image inputs for the model.</p>

<pre>>>> import torch
>>> from transformers import IdeficsForVisionText2Text, AutoProcessor

>>> processor = AutoProcessor.from_pretrained(checkpoint)
>>> model = IdeficsForVisionText2Text.from_pretrained(checkpoint, torch_dtype=torch.bfloat16, device_map="auto")</pre>

<p>Setting <code>device_map</code> to <code>"auto"</code> will automatically determine how to load and store the model weights in the most optimized
manner given existing devices.</p>

<h3 id="quantized-model">Quantized model</h3>

<p>If high-memory GPU availability is an issue, you can load the quantized version of the model. To load the model and the
processor in 4-bit precision, pass a <code>BitsAndBytesConfig</code> to the <code>from_pretrained</code> method, and the model will be compressed
on the fly while loading.</p>

<div class="code-block"><pre>>>> import torch
>>> from transformers import IdeficsForVisionText2Text, AutoProcessor, BitsAndBytesConfig

>>> quantization_config = BitsAndBytesConfig(
...     load_in_4bit=True,
...     bnb_4bit_compute_dtype=torch.float16,
... )
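# Rough memory arithmetic (a sketch; assumes weight storage dominates and ignores
# activations and overhead): 4-bit weights take about 0.5 bytes per parameter,
# bfloat16 takes 2 bytes. For the 9b checkpoint that is roughly
# 9e9 * 0.5 bytes = 4.5 GB quantized, versus 9e9 * 2 bytes = 18 GB in bfloat16,
# which is why the non-quantized model needs the 20GB of GPU memory noted above.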
>>> processor = AutoProcessor.from_pretrained(checkpoint)
>>> model = IdeficsForVisionText2Text.from_pretrained(
...     checkpoint,
...     quantization_config=quantization_config,
...     device_map="auto"
... )</pre></div>

<p>Now that you have the model loaded in one of the suggested ways, let’s move on to exploring tasks that you can use IDEFICS for.</p>

<h2 id="image-captioning">Image captioning</h2>

<p>Image captioning is the task of predicting a caption for a given image. A common application is to help visually impaired
people navigate through different situations, for instance, by exploring image content online.</p>

<p>To illustrate the task, get an image to be captioned, e.g.:</p>

<div class="flex justify-center"><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/idefics-im-captioning.jpg" alt="Image of a puppy in a flower bed"></div>

<p>Photo by <a href="https://unsplash.com/@hendoo" rel="nofollow">Hendo Wang</a>.</p>

<p>IDEFICS accepts text and image prompts. However, to caption an image, you do not have to provide a text prompt to the
model, only the preprocessed input image. Without a text prompt, the model will start generating text from the
BOS (beginning-of-sequence) token, thus creating a caption.</p>

<p>As image input to the model, you can use either an image object (<code>PIL.Image</code>) or a URL from which the image can be retrieved.</p>

<div class="code-block"><pre>>>> prompt = [
...     "https://images.unsplash.com/photo-1583160247711-2191776b4b91?ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&auto=format&fit=crop&w=3542&q=80",
... ]
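# As noted above, a PIL.Image object works in place of the URL. A hedged sketch
# ("puppy.jpg" is a hypothetical local file):
#   from PIL import Image
#   prompt = [Image.open("puppy.jpg")]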
>>> inputs = processor(prompt, return_tensors="pt").to("cuda")
>>> bad_words_ids = processor.tokenizer(["<image>", "<fake_token_around_image>"], add_special_tokens=False).input_ids
>>> generated_ids = model.generate(**inputs, max_new_tokens=10, bad_words_ids=bad_words_ids)
>>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
>>> print(generated_text[0])
A puppy in a flower bed</pre></div>

<div class="course-tip"><p>It is a good idea to include the <code>bad_words_ids</code> in the call to <code>generate</code> to avoid errors arising when increasing
the <code>max_new_tokens</code>: the model will try to generate a new <code><image></code> or <code><fake_token_around_image></code> token when there
is no corresponding image being generated by the model.
You can set it on the fly as in this guide, or store it in the <code>GenerationConfig</code> as described in the <a href="../generation_strategies">Text generation strategies</a> guide.</p></div>

<h2 id="prompted-image-captioning">Prompted image captioning</h2>

<p>You can extend image captioning by providing a text prompt, which the model will continue given the image. Let’s take
another image to illustrate:</p>

<div class="flex justify-center"><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/idefics-prompted-im-captioning.jpg" alt="Image of the Eiffel Tower at night"></div>

<p>Photo by <a href="https://unsplash.com/@dnevozhai" rel="nofollow">Denys Nevozhai</a>.</p>

<p>Textual and image prompts can be passed to the model’s processor as a single list to create appropriate inputs.</p>

<div class="code-block"><pre>>>> prompt = [
...     "https://images.unsplash.com/photo-1543349689-9a4d426bee8e?ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&auto=format&fit=crop&w=3501&q=80",
...     "This is an image of ",
... ]
>>> inputs = processor(prompt, return_tensors="pt").to("cuda")
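# Alternative sketch: instead of passing bad_words_ids on every generate() call,
# you can store it once in the model's GenerationConfig (see the Text generation
# strategies guide):
#   model.generation_config.bad_words_ids = processor.tokenizer(
#       ["<image>", "<fake_token_around_image>"], add_special_tokens=False).input_ids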
>>> bad_words_ids = processor.tokenizer(["<image>", "<fake_token_around_image>"], add_special_tokens=False).input_ids
>>> generated_ids = model.generate(**inputs, max_new_tokens=10, bad_words_ids=bad_words_ids)
>>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
>>> print(generated_text[0])
This is an image of the Eiffel Tower in Paris, France.</pre></div>

<h2 id="few-shot-prompting">Few-shot prompting</h2>

<p>While IDEFICS demonstrates great zero-shot results, your task may require a certain format of the caption, or come with
other restrictions or requirements that increase the task’s complexity. Few-shot prompting can be used to enable in-context learning:
by providing examples in the prompt, you can steer the model to generate results that mimic the format of the given examples.</p>

<p>Let’s use the previous image of the Eiffel Tower as an example for the model, and build a prompt that demonstrates to the model
that in addition to learning what the object in an image is, we would also like to get some interesting information about it.
Then, let’s see if we can get the same response format for an image of the Statue of Liberty:</p>

<div class="flex justify-center"><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/idefics-few-shot.jpg" alt="Image of the Statue of Liberty"></div>

<p>Photo by <a href="https://unsplash.com/@jmayobres" rel="nofollow">Juan Mayobre</a>.</p>

<div class="code-block"><pre>>>> prompt = ["User:",
...     "https://images.unsplash.com/photo-1543349689-9a4d426bee8e?ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&auto=format&fit=crop&w=3501&q=80",
...     "Describe this image.\nAssistant: An image of the Eiffel Tower at night. Fun fact: the Eiffel Tower is the same height as an 81-storey building.\n",
...     "User:",
...     "https://images.unsplash.com/photo-1524099163253-32b7f0256868?ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&auto=format&fit=crop&w=3387&q=80",
...     "Describe this image.\nAssistant:"
... ]
| <span class="hljs-meta">>>> </span>inputs = processor(prompt, return_tensors=<span class="hljs-string">"pt"</span>).to(<span class="hljs-string">"cuda"</span>) | |
| <span class="hljs-meta">>>> </span>bad_words_ids = processor.tokenizer([<span class="hljs-string">"<image>"</span>, <span class="hljs-string">"<fake_token_around_image>"</span>], add_special_tokens=<span class="hljs-literal">False</span>).input_ids | |
| <span class="hljs-meta">>>> </span>generated_ids = model.generate(**inputs, max_new_tokens=<span class="hljs-number">30</span>, bad_words_ids=bad_words_ids) | |
| <span class="hljs-meta">>>> </span>generated_text = processor.batch_decode(generated_ids, skip_special_tokens=<span class="hljs-literal">True</span>) | |
| <span class="hljs-meta">>>> </span><span class="hljs-built_in">print</span>(generated_text[<span class="hljs-number">0</span>]) | |
| User: Describe this image. | |
| Assistant: An image of the Eiffel Tower at night. Fun fact: the Eiffel Tower <span class="hljs-keyword">is</span> the same height <span class="hljs-keyword">as</span> an <span class="hljs-number">81</span>-storey building. | |
| User: Describe this image. | |
| Assistant: An image of the Statue of Liberty. Fun fact: the Statue of Liberty <span class="hljs-keyword">is</span> <span class="hljs-number">151</span> feet tall.<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-mpygyg">Notice that just from a single example (i.e., 1-shot) the model has learned how to perform the task. For more complex tasks, | |
| feel free to experiment with a larger number of examples (e.g., 3-shot, 5-shot, etc.).</p> <h2 class="relative group"><a id="visual-question-answering" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#visual-question-answering"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Visual question answering</span></h2> <p data-svelte-h="svelte-cnfg69">Visual Question Answering (VQA) is the task of answering open-ended questions based on an image. Similar to image | |
| captioning it can be used in accessibility applications, but also in education (reasoning about visual materials), customer | |
| service (questions about products based on images), and image retrieval.</p> <p data-svelte-h="svelte-nptt59">Let’s get a new image for this task:</p> <div class="flex justify-center" data-svelte-h="svelte-1j2xr8e"><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/idefics-vqa.jpg" alt="Image of a couple having a picnic"></div> <p data-svelte-h="svelte-1aj82zb">Photo by <a href="https://unsplash.com/@jarritos" rel="nofollow">Jarritos Mexican Soda</a>.</p> <p data-svelte-h="svelte-tdtrto">You can steer the model from image captioning to visual question answering by prompting it with appropriate instructions:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">>>> </span>prompt = [ | |
| <span class="hljs-meta">... </span> <span class="hljs-string">"Instruction: Provide an answer to the question. Use the image to answer.\n"</span>, | |
| <span class="hljs-meta">... </span> <span class="hljs-string">"https://images.unsplash.com/photo-1623944889288-cd147dbb517c?ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&auto=format&fit=crop&w=3540&q=80"</span>, | |
| <span class="hljs-meta">... </span> <span class="hljs-string">"Question: Where are these people and what's the weather like? Answer:"</span> | |
| <span class="hljs-meta">... </span>] | |
| <span class="hljs-meta">>>> </span>inputs = processor(prompt, return_tensors=<span class="hljs-string">"pt"</span>).to(<span class="hljs-string">"cuda"</span>) | |
| <span class="hljs-meta">>>> </span>bad_words_ids = processor.tokenizer([<span class="hljs-string">"<image>"</span>, <span class="hljs-string">"<fake_token_around_image>"</span>], add_special_tokens=<span class="hljs-literal">False</span>).input_ids | |
| <span class="hljs-meta">>>> </span>generated_ids = model.generate(**inputs, max_new_tokens=<span class="hljs-number">20</span>, bad_words_ids=bad_words_ids) | |
| <span class="hljs-meta">>>> </span>generated_text = processor.batch_decode(generated_ids, skip_special_tokens=<span class="hljs-literal">True</span>) | |
| <span class="hljs-meta">>>> </span><span class="hljs-built_in">print</span>(generated_text[<span class="hljs-number">0</span>]) | |
| Instruction: Provide an answer to the question. Use the image to answer. | |
| Question: Where are these people <span class="hljs-keyword">and</span> what<span class="hljs-string">'s the weather like? Answer: They'</span>re <span class="hljs-keyword">in</span> a park <span class="hljs-keyword">in</span> New York City, <span class="hljs-keyword">and</span> it<span class="hljs-string">'s a beautiful day.</span><!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="image-classification" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#image-classification"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Image classification</span></h2> <p data-svelte-h="svelte-a4cfdv">IDEFICS is capable of classifying images into different categories without being explicitly trained on data containing | |
| labeled examples from those specific categories. Given a list of categories and using its image and text understanding | |
| capabilities, the model can infer which category the image likely belongs to.</p> <p data-svelte-h="svelte-1xbkffx">Say, we have this image of a vegetable stand:</p> <div class="flex justify-center" data-svelte-h="svelte-g02ga3"><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/idefics-classification.jpg" alt="Image of a vegetable stand"></div> <p data-svelte-h="svelte-17q4ltv">Photo by <a href="https://unsplash.com/@peterwendt" rel="nofollow">Peter Wendt</a>.</p> <p data-svelte-h="svelte-13lz1gw">We can instruct the model to classify the image into one of the categories that we have:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">>>> </span>categories = [<span class="hljs-string">'animals'</span>,<span class="hljs-string">'vegetables'</span>, 
<span class="hljs-string">'city landscape'</span>, <span class="hljs-string">'cars'</span>, <span class="hljs-string">'office'</span>] | |
| <span class="hljs-meta">>>> </span>prompt = [<span class="hljs-string">f"Instruction: Classify the following image into a single category from the following list: <span class="hljs-subst">{categories}</span>.\n"</span>, | |
| <span class="hljs-meta">... </span> <span class="hljs-string">"https://images.unsplash.com/photo-1471193945509-9ad0617afabf?ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&auto=format&fit=crop&w=3540&q=80"</span>, | |
| <span class="hljs-meta">... </span> <span class="hljs-string">"Category: "</span> | |
| <span class="hljs-meta">... </span>] | |
| <span class="hljs-meta">>>> </span>inputs = processor(prompt, return_tensors=<span class="hljs-string">"pt"</span>).to(<span class="hljs-string">"cuda"</span>) | |
| <span class="hljs-meta">>>> </span>bad_words_ids = processor.tokenizer([<span class="hljs-string">"<image>"</span>, <span class="hljs-string">"<fake_token_around_image>"</span>], add_special_tokens=<span class="hljs-literal">False</span>).input_ids | |
| <span class="hljs-meta">>>> </span>generated_ids = model.generate(**inputs, max_new_tokens=<span class="hljs-number">6</span>, bad_words_ids=bad_words_ids) | |
| <span class="hljs-meta">>>> </span>generated_text = processor.batch_decode(generated_ids, skip_special_tokens=<span class="hljs-literal">True</span>) | |
| <span class="hljs-meta">>>> </span><span class="hljs-built_in">print</span>(generated_text[<span class="hljs-number">0</span>]) | |
| Instruction: Classify the following image into a single category <span class="hljs-keyword">from</span> the following <span class="hljs-built_in">list</span>: [<span class="hljs-string">'animals'</span>, <span class="hljs-string">'vegetables'</span>, <span class="hljs-string">'city landscape'</span>, <span class="hljs-string">'cars'</span>, <span class="hljs-string">'office'</span>]. | |
| Category: Vegetables<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-571z8p">In the example above we instruct the model to classify the image into a single category, however, you can also prompt the model to do rank classification.</p> <h2 class="relative group"><a id="image-guided-text-generation" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#image-guided-text-generation"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Image-guided text generation</span></h2> <p data-svelte-h="svelte-1cpi6ml">For more creative applications, you can use image-guided text generation to generate text based on an image. This can be | |
| useful to create descriptions of products, ads, descriptions of a scene, etc.</p> <p data-svelte-h="svelte-bxfm2h">Let’s prompt IDEFICS to write a story based on a simple image of a red door:</p> <div class="flex justify-center" data-svelte-h="svelte-1mf93u3"><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/idefics-story-generation.jpg" alt="Image of a red door with a pumpkin on the steps"></div> <p data-svelte-h="svelte-75pbgi">Photo by <a href="https://unsplash.com/@devonshiremedia" rel="nofollow">Craig Tidball</a>.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">>>> </span>prompt = [<span class="hljs-string">"Instruction: Use the image to write a story. \n"</span>, | |
| <span class="hljs-meta">... </span> <span class="hljs-string">"https://images.unsplash.com/photo-1517086822157-2b0358e7684a?ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&auto=format&fit=crop&w=2203&q=80"</span>, | |
| <span class="hljs-meta">... </span> <span class="hljs-string">"Story: \n"</span>] | |
| <span class="hljs-meta">>>> </span>inputs = processor(prompt, return_tensors=<span class="hljs-string">"pt"</span>).to(<span class="hljs-string">"cuda"</span>) | |
| <span class="hljs-meta">>>> </span>bad_words_ids = processor.tokenizer([<span class="hljs-string">"<image>"</span>, <span class="hljs-string">"<fake_token_around_image>"</span>], add_special_tokens=<span class="hljs-literal">False</span>).input_ids | |
| <span class="hljs-meta">>>> </span>generated_ids = model.generate(**inputs, num_beams=<span class="hljs-number">2</span>, max_new_tokens=<span class="hljs-number">200</span>, bad_words_ids=bad_words_ids) | |
| <span class="hljs-meta">>>> </span>generated_text = processor.batch_decode(generated_ids, skip_special_tokens=<span class="hljs-literal">True</span>) | |
| <span class="hljs-meta">>>> </span><span class="hljs-built_in">print</span>(generated_text[<span class="hljs-number">0</span>]) | |
| Instruction: Use the image to write a story. | |
| Story: | |
| Once upon a time, there was a little girl who lived <span class="hljs-keyword">in</span> a house <span class="hljs-keyword">with</span> a red door. She loved her red door. It was the prettiest door <span class="hljs-keyword">in</span> the whole world. | |
| One day, the little girl was playing <span class="hljs-keyword">in</span> her yard when she noticed a man standing on her doorstep. He was wearing a long black coat <span class="hljs-keyword">and</span> a top hat. | |
| The little girl ran inside <span class="hljs-keyword">and</span> told her mother about the man. | |
| Her mother said, “Don’t worry, honey. He’s just a friendly ghost.” | |
| The little girl wasn’t sure <span class="hljs-keyword">if</span> she believed her mother, but she went outside anyway. | |
| When she got to the door, the man was gone. | |
| The <span class="hljs-built_in">next</span> day, the little girl was playing <span class="hljs-keyword">in</span> her yard again when she noticed the man standing on her doorstep. | |
| He was wearing a long black coat <span class="hljs-keyword">and</span> a top hat. | |
| The little girl ran<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-eumo01">Looks like IDEFICS noticed the pumpkin on the doorstep and went with a spooky Halloween story about a ghost.</p> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-79vr3a">For longer outputs like this, you will greatly benefit from tweaking the text generation strategy. This can help | |
| you significantly improve the quality of the generated output. Check out <a href="../generation_strategies">Text generation strategies</a> | |
| to learn more.</p></div> <h2 class="relative group"><a id="running-inference-in-batch-mode" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#running-inference-in-batch-mode"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Running inference in batch mode</span></h2> <p data-svelte-h="svelte-2k4kpw">All of the earlier sections illustrated IDEFICS for a single example. In a very similar fashion, you can run inference | |
| for a batch of examples by passing a list of prompts:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">>>> </span>prompts = [ | |
| <span class="hljs-meta">... </span> [ <span class="hljs-string">"https://images.unsplash.com/photo-1543349689-9a4d426bee8e?ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&auto=format&fit=crop&w=3501&q=80"</span>, | |
| <span class="hljs-meta">... </span> <span class="hljs-string">"This is an image of "</span>, | |
| <span class="hljs-meta">... </span> ], | |
| <span class="hljs-meta">... </span> [ <span class="hljs-string">"https://images.unsplash.com/photo-1623944889288-cd147dbb517c?ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&auto=format&fit=crop&w=3540&q=80"</span>, | |
| <span class="hljs-meta">... </span> <span class="hljs-string">"This is an image of "</span>, | |
| <span class="hljs-meta">... </span> ], | |
| <span class="hljs-meta">... </span> [ <span class="hljs-string">"https://images.unsplash.com/photo-1471193945509-9ad0617afabf?ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&auto=format&fit=crop&w=3540&q=80"</span>, | |
| <span class="hljs-meta">... </span> <span class="hljs-string">"This is an image of "</span>, | |
| <span class="hljs-meta">... </span> ], | |
| <span class="hljs-meta">... </span>] | |
| <span class="hljs-meta">>>> </span>inputs = processor(prompts, return_tensors=<span class="hljs-string">"pt"</span>).to(<span class="hljs-string">"cuda"</span>) | |
| <span class="hljs-meta">>>> </span>bad_words_ids = processor.tokenizer([<span class="hljs-string">"<image>"</span>, <span class="hljs-string">"<fake_token_around_image>"</span>], add_special_tokens=<span class="hljs-literal">False</span>).input_ids | |
| <span class="hljs-meta">>>> </span>generated_ids = model.generate(**inputs, max_new_tokens=<span class="hljs-number">10</span>, bad_words_ids=bad_words_ids) | |
| <span class="hljs-meta">>>> </span>generated_text = processor.batch_decode(generated_ids, skip_special_tokens=<span class="hljs-literal">True</span>) | |
| <span class="hljs-meta">>>> </span><span class="hljs-keyword">for</span> i,t <span class="hljs-keyword">in</span> <span class="hljs-built_in">enumerate</span>(generated_text): | |
| <span class="hljs-meta">... </span> <span class="hljs-built_in">print</span>(<span class="hljs-string">f"<span class="hljs-subst">{i}</span>:\n<span class="hljs-subst">{t}</span>\n"</span>) | |
| <span class="hljs-number">0</span>: | |
| This <span class="hljs-keyword">is</span> an image of the Eiffel Tower <span class="hljs-keyword">in</span> Paris, France. | |
| <span class="hljs-number">1</span>: | |
| This <span class="hljs-keyword">is</span> an image of a couple on a picnic blanket. | |
| <span class="hljs-number">2</span>: | |
| This <span class="hljs-keyword">is</span> an image of a vegetable stand.<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="idefics-instruct-for-conversational-use" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#idefics-instruct-for-conversational-use"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>IDEFICS instruct for conversational use</span></h2> <p data-svelte-h="svelte-ay5071">For conversational use cases, you can find fine-tuned instructed versions of the model on the 🤗 Hub: | |
| <code>HuggingFaceM4/idefics-80b-instruct</code> and <code>HuggingFaceM4/idefics-9b-instruct</code>.</p> <p data-svelte-h="svelte-8tbmhu">These checkpoints are the result of fine-tuning the respective base models on a mixture of supervised and instruction | |
| fine-tuning datasets, which boosts downstream performance while making the models more usable in conversational settings.</p> <p data-svelte-h="svelte-ccrnle">Prompting and usage for the conversational checkpoints are very similar to those for the base models:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">>>> </span><span class="hljs-keyword">import</span> torch | |
| <span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> IdeficsForVisionText2Text, AutoProcessor | |
| <span class="hljs-meta">>>> </span>device = <span class="hljs-string">"cuda"</span> <span class="hljs-keyword">if</span> torch.cuda.is_available() <span class="hljs-keyword">else</span> <span class="hljs-string">"cpu"</span> | |
| <span class="hljs-meta">>>> </span>checkpoint = <span class="hljs-string">"HuggingFaceM4/idefics-9b-instruct"</span> | |
| <span class="hljs-meta">>>> </span>model = IdeficsForVisionText2Text.from_pretrained(checkpoint, torch_dtype=torch.bfloat16).to(device) | |
| <span class="hljs-meta">>>> </span>processor = AutoProcessor.from_pretrained(checkpoint) | |
| <span class="hljs-meta">>>> </span>prompts = [ | |
| <span class="hljs-meta">... </span> [ | |
| <span class="hljs-meta">... </span> <span class="hljs-string">"User: What is in this image?"</span>, | |
| <span class="hljs-meta">... </span> <span class="hljs-string">"https://upload.wikimedia.org/wikipedia/commons/8/86/Id%C3%A9fix.JPG"</span>, | |
| <span class="hljs-meta">... </span> <span class="hljs-string">"<end_of_utterance>"</span>, | |
| <span class="hljs-meta">... </span> <span class="hljs-string">"\nAssistant: This picture depicts Idefix, the dog of Obelix in Asterix and Obelix. Idefix is running on the ground.<end_of_utterance>"</span>, | |
| <span class="hljs-meta">... </span> <span class="hljs-string">"\nUser:"</span>, | |
| <span class="hljs-meta">... </span> <span class="hljs-string">"https://static.wikia.nocookie.net/asterix/images/2/25/R22b.gif/revision/latest?cb=20110815073052"</span>, | |
| <span class="hljs-meta">... </span> <span class="hljs-string">"And who is that?<end_of_utterance>"</span>, | |
| <span class="hljs-meta">... </span> <span class="hljs-string">"\nAssistant:"</span>, | |
| <span class="hljs-meta">... </span> ], | |
| <span class="hljs-meta">... </span>] | |
| <span class="hljs-meta">>>> </span><span class="hljs-comment"># --batched mode</span> | |
| <span class="hljs-meta">>>> </span>inputs = processor(prompts, add_end_of_utterance_token=<span class="hljs-literal">False</span>, return_tensors=<span class="hljs-string">"pt"</span>).to(device) | |
| <span class="hljs-meta">>>> </span><span class="hljs-comment"># --single sample mode</span> | |
| <span class="hljs-meta">>>> </span><span class="hljs-comment"># inputs = processor(prompts[0], return_tensors="pt").to(device)</span> | |
| <span class="hljs-meta">>>> </span><span class="hljs-comment"># Generation args</span> | |
| <span class="hljs-meta">>>> </span>exit_condition = processor.tokenizer(<span class="hljs-string">"<end_of_utterance>"</span>, add_special_tokens=<span class="hljs-literal">False</span>).input_ids | |
| <span class="hljs-meta">>>> </span>bad_words_ids = processor.tokenizer([<span class="hljs-string">"<image>"</span>, <span class="hljs-string">"<fake_token_around_image>"</span>], add_special_tokens=<span class="hljs-literal">False</span>).input_ids | |
| <span class="hljs-meta">>>> </span>generated_ids = model.generate(**inputs, eos_token_id=exit_condition, bad_words_ids=bad_words_ids, max_length=<span class="hljs-number">100</span>) | |
| <span class="hljs-meta">>>> </span>generated_text = processor.batch_decode(generated_ids, skip_special_tokens=<span class="hljs-literal">True</span>) | |
| <span class="hljs-meta">>>> </span><span class="hljs-keyword">for</span> i, t <span class="hljs-keyword">in</span> <span class="hljs-built_in">enumerate</span>(generated_text): | |
| <span class="hljs-meta">... </span> <span class="hljs-built_in">print</span>(<span class="hljs-string">f"<span class="hljs-subst">{i}</span>:\n<span class="hljs-subst">{t}</span>\n"</span>)<!-- HTML_TAG_END --></pre></div> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/transformers/blob/main/docs/source/en/tasks/idefics.md" target="_blank"><span data-svelte-h="svelte-1kd6by1"><</span> <span data-svelte-h="svelte-x0xyl0">></span> <span data-svelte-h="svelte-1dajgef"><span class="underline ml-1.5">Update</span> on GitHub</span></a> <p></p> | |