Buckets:

hf-doc-build/doc / transformers /main /en /chat_template_multimodal.html
rtrm's picture
download
raw
50.2 kB
<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Multimodal Chat Templates for Vision and Audio LLMs&quot;,&quot;local&quot;:&quot;multimodal-chat-templates-for-vision-and-audio-llms&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Image inputs&quot;,&quot;local&quot;:&quot;image-inputs&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Image paths or URLs&quot;,&quot;local&quot;:&quot;image-paths-or-urls&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Video inputs&quot;,&quot;local&quot;:&quot;video-inputs&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Sampling with fixed number of frames&quot;,&quot;local&quot;:&quot;sampling-with-fixed-number-of-frames&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Sampling with FPS&quot;,&quot;local&quot;:&quot;sampling-with-fps&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Custom Frame Sampling with a Function&quot;,&quot;local&quot;:&quot;custom-frame-sampling-with-a-function&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;List of image frames as video&quot;,&quot;local&quot;:&quot;list-of-image-frames-as-video&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Multimodal conversational pipeline&quot;,&quot;local&quot;:&quot;multimodal-conversational-pipeline&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Best Practices for Multimodal Template Configuration&quot;,&quot;local&quot;:&quot;best-practices-for-multimodal-template-configuration&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}">
<link href="/docs/transformers/main/en/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload">
<link rel="modulepreload" href="/docs/transformers/main/en/_app/immutable/entry/start.8cba93b8.js">
<link rel="modulepreload" href="/docs/transformers/main/en/_app/immutable/chunks/scheduler.bdbef820.js">
<link rel="modulepreload" href="/docs/transformers/main/en/_app/immutable/chunks/singletons.6587ca2b.js">
<link rel="modulepreload" href="/docs/transformers/main/en/_app/immutable/chunks/index.8a885b74.js">
<link rel="modulepreload" href="/docs/transformers/main/en/_app/immutable/chunks/paths.2c06013a.js">
<link rel="modulepreload" href="/docs/transformers/main/en/_app/immutable/entry/app.00a1188c.js">
<link rel="modulepreload" href="/docs/transformers/main/en/_app/immutable/chunks/index.33f81d56.js">
<link rel="modulepreload" href="/docs/transformers/main/en/_app/immutable/nodes/0.876f3f60.js">
<link rel="modulepreload" href="/docs/transformers/main/en/_app/immutable/chunks/each.e59479a4.js">
<link rel="modulepreload" href="/docs/transformers/main/en/_app/immutable/nodes/13.c42adde1.js">
<link rel="modulepreload" href="/docs/transformers/main/en/_app/immutable/chunks/Tip.34194030.js">
<link rel="modulepreload" href="/docs/transformers/main/en/_app/immutable/chunks/CodeBlock.362b34a4.js">
<link rel="modulepreload" href="/docs/transformers/main/en/_app/immutable/chunks/EditOnGithub.a9246e21.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Multimodal Chat Templates for Vision and Audio LLMs&quot;,&quot;local&quot;:&quot;multimodal-chat-templates-for-vision-and-audio-llms&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Image inputs&quot;,&quot;local&quot;:&quot;image-inputs&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Image paths or URLs&quot;,&quot;local&quot;:&quot;image-paths-or-urls&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Video inputs&quot;,&quot;local&quot;:&quot;video-inputs&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Sampling with fixed number of frames&quot;,&quot;local&quot;:&quot;sampling-with-fixed-number-of-frames&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Sampling with FPS&quot;,&quot;local&quot;:&quot;sampling-with-fps&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Custom Frame Sampling with a Function&quot;,&quot;local&quot;:&quot;custom-frame-sampling-with-a-function&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;List of image frames as video&quot;,&quot;local&quot;:&quot;list-of-image-frames-as-video&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Multimodal conversational pipeline&quot;,&quot;local&quot;:&quot;multimodal-conversational-pipeline&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Best Practices for Multimodal Template Configuration&quot;,&quot;local&quot;:&quot;best-practices-for-multimodal-template-configuration&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <h1 class="relative group"><a id="multimodal-chat-templates-for-vision-and-audio-llms" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#multimodal-chat-templates-for-vision-and-audio-llms"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Multimodal Chat Templates for Vision and Audio LLMs</span></h1> <p data-svelte-h="svelte-1vh73bc">In this section, we’ll explore how to use chat templates with multimodal models, enabling your templates to handle a variety of inputs such as text, images, and audio. Multimodal models provide richer, more interactive experiences, and understanding how to effectively combine these inputs within your templates is key. We’ll walk through how to work with different modalities, configure your templates for optimal performance, and tackle common challenges along the way.</p> <p data-svelte-h="svelte-w0zoss">Just like with text-only LLMs, multimodal models expect a chat with <strong>messages</strong>, each of which includes a <strong>role</strong> and <strong>content</strong>. However, for multimodal models, chat templates are a part of the <a href="./main_cllasses/processors">Processor</a> class. Let’s see how we can format our prompts when there are images or videos in the input along with text.</p> <h2 class="relative group"><a id="image-inputs" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#image-inputs"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Image inputs</span></h2> <p data-svelte-h="svelte-97p3uh">For models such as <a href="https://huggingface.co/llava-hf" rel="nofollow">LLaVA</a> the prompts can be formatted as below. Notice that the only difference from text-only models is that we need to also pass a placeholder for input images. To accommodate for extra modalities, each <strong>content</strong> is a list containing either a text or an image <strong>type</strong>.</p> <p data-svelte-h="svelte-1ljzup">Let’s make this concrete with a quick example using the <code>llava-hf/llava-onevision-qwen2-0.5b-ov-hf</code> model:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoProcessor, LlavaOnevisionForConditionalGeneration
model_id = <span class="hljs-string">&quot;llava-hf/llava-onevision-qwen2-0.5b-ov-hf&quot;</span>
processor = AutoProcessor.from_pretrained(model_id)
messages = [
{
<span class="hljs-string">&quot;role&quot;</span>: <span class="hljs-string">&quot;system&quot;</span>,
<span class="hljs-string">&quot;content&quot;</span>: [{<span class="hljs-string">&quot;type&quot;</span>: <span class="hljs-string">&quot;text&quot;</span>, <span class="hljs-string">&quot;text&quot;</span>: <span class="hljs-string">&quot;You are a friendly chatbot who always responds in the style of a pirate&quot;</span>}],
},
{
<span class="hljs-string">&quot;role&quot;</span>: <span class="hljs-string">&quot;user&quot;</span>,
<span class="hljs-string">&quot;content&quot;</span>: [
{<span class="hljs-string">&quot;type&quot;</span>: <span class="hljs-string">&quot;image&quot;</span>},
{<span class="hljs-string">&quot;type&quot;</span>: <span class="hljs-string">&quot;text&quot;</span>, <span class="hljs-string">&quot;text&quot;</span>: <span class="hljs-string">&quot;What are these?&quot;</span>},
],
},
]
formatted_prompt = processor.apply_chat_template(messages, add_generation_prompt=<span class="hljs-literal">True</span>, tokenize=<span class="hljs-literal">False</span>)
<span class="hljs-built_in">print</span>(formatted_prompt)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1rg1ihm">This yields a string in LLaVA’s expected input format with many <code>&lt;image&gt;</code> tokens prepended before the text.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->&#x27;&lt;|im_start|&gt;system
&lt;|im_start|&gt;system
You are a friendly chatbot who always responds in the style of a pirate&lt;|im_end|&gt;&lt;|im_start|&gt;user &lt;image&gt;
What are these?&lt;|im_end|&gt;<!-- HTML_TAG_END --></pre></div> <h3 class="relative group"><a id="image-paths-or-urls" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#image-paths-or-urls"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Image paths or URLs</span></h3> <p data-svelte-h="svelte-1mjcd7j">To incorporate images into your chat templates, you can pass them as file paths or URLs. This method automatically loads the image, processes it, and prepares the necessary pixel values to create ready-to-use inputs for the model. This approach simplifies the integration of images, enabling seamless multimodal functionality.</p> <p data-svelte-h="svelte-1fc6o5w">Let’s see how it works with an example using the same model as above. This time we’ll indicate an image URL with <code>&quot;url&quot;</code> key in the message’s <strong>content</strong> and ask the chat template to <code>tokenize</code> and <code>return_dict</code>. Currently, “base64”, “url”, and “path” are supported image sources.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoProcessor, LlavaOnevisionForConditionalGeneration
model_id = <span class="hljs-string">&quot;llava-hf/llava-onevision-qwen2-0.5b-ov-hf&quot;</span>
model = LlavaOnevisionForConditionalGeneration.from_pretrained(model_id)
processor = AutoProcessor.from_pretrained(model_id)
messages = [
{
<span class="hljs-string">&quot;role&quot;</span>: <span class="hljs-string">&quot;system&quot;</span>,
<span class="hljs-string">&quot;content&quot;</span>: [{<span class="hljs-string">&quot;type&quot;</span>: <span class="hljs-string">&quot;text&quot;</span>, <span class="hljs-string">&quot;text&quot;</span>: <span class="hljs-string">&quot;You are a friendly chatbot who always responds in the style of a pirate&quot;</span>}],
},
{
<span class="hljs-string">&quot;role&quot;</span>: <span class="hljs-string">&quot;user&quot;</span>,
<span class="hljs-string">&quot;content&quot;</span>: [
{<span class="hljs-string">&quot;type&quot;</span>: <span class="hljs-string">&quot;image&quot;</span>, <span class="hljs-string">&quot;url&quot;</span>: <span class="hljs-string">&quot;http://images.cocodataset.org/val2017/000000039769.jpg&quot;</span>},
{<span class="hljs-string">&quot;type&quot;</span>: <span class="hljs-string">&quot;text&quot;</span>, <span class="hljs-string">&quot;text&quot;</span>: <span class="hljs-string">&quot;What are these?&quot;</span>},
],
},
]
processed_chat = processor.apply_chat_template(messages, add_generation_prompt=<span class="hljs-literal">True</span>, tokenize=<span class="hljs-literal">True</span>, return_dict=<span class="hljs-literal">True</span>, return_tensors=<span class="hljs-string">&quot;pt&quot;</span>)
<span class="hljs-built_in">print</span>(processed_chat.keys())<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-dbnusc">This yields a dictionary with inputs processed and ready to be further passed into <a href="/docs/transformers/main/en/main_classes/text_generation#transformers.GenerationMixin.generate">generate()</a> to generate text.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->dict_keys([&quot;input_ids&quot;, &quot;attention_mask&quot;, &quot;pixel_values&quot;, &quot;image_sizes&quot;])<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="video-inputs" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#video-inputs"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Video inputs</span></h2> <p data-svelte-h="svelte-8qhe2y">Some vision models support videos as inputs as well as images. The message format is very similar to the image-only models with tiny differences to handle loading videos from a URL. We can continue using the same model as before since it supports videos.</p> <h3 class="relative group"><a id="sampling-with-fixed-number-of-frames" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#sampling-with-fixed-number-of-frames"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Sampling with fixed number of frames</span></h3> <p data-svelte-h="svelte-h7pp8l">Here’s an example of how to set up a conversation with video inputs. Notice the extra <code>kwargs</code> passed to <code>processor.apply_chat_template()</code>. The key parameter here is <code>num_frames</code>, which controls how many frames to sample uniformly from the video. Each model checkpoint has a maximum frame count it was trained with, and exceeding this limit can significantly impact generation quality. So, it’s important to choose a frame count that fits both the model’s capacity and your computational resources. If you don’t specify <code>num_frames</code>, the entire video will be loaded without any frame sampling.</p> <p data-svelte-h="svelte-qmm6k4">You also have the option to choose a specific framework to load the video, depending on your preferences or needs. Currently, we support <code>decord</code>, <code>pyav</code> (the default), <code>opencv</code>, and <code>torchvision</code>. For this example, we’ll use <code>decord</code>, as it’s a bit faster than <code>pyav</code>.</p> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-15dpt1t">Note that if you are trying to load a video from URL, you can decode the video only with <code>pyav</code> or <code>decord</code> as backend.</p></div> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoProcessor, LlavaOnevisionForConditionalGeneration
model_id = <span class="hljs-string">&quot;llava-hf/llava-onevision-qwen2-0.5b-ov-hf&quot;</span>
model = LlavaOnevisionForConditionalGeneration.from_pretrained(model_id)
processor = AutoProcessor.from_pretrained(model_id)
messages = [
{
<span class="hljs-string">&quot;role&quot;</span>: <span class="hljs-string">&quot;system&quot;</span>,
<span class="hljs-string">&quot;content&quot;</span>: [{<span class="hljs-string">&quot;type&quot;</span>: <span class="hljs-string">&quot;text&quot;</span>, <span class="hljs-string">&quot;text&quot;</span>: <span class="hljs-string">&quot;You are a friendly chatbot who always responds in the style of a pirate&quot;</span>}],
},
{
<span class="hljs-string">&quot;role&quot;</span>: <span class="hljs-string">&quot;user&quot;</span>,
<span class="hljs-string">&quot;content&quot;</span>: [
{<span class="hljs-string">&quot;type&quot;</span>: <span class="hljs-string">&quot;video&quot;</span>, <span class="hljs-string">&quot;url&quot;</span>: <span class="hljs-string">&quot;https://test-videos.co.uk/vids/bigbuckbunny/mp4/h264/720/Big_Buck_Bunny_720_10s_10MB.mp4&quot;</span>},
{<span class="hljs-string">&quot;type&quot;</span>: <span class="hljs-string">&quot;text&quot;</span>, <span class="hljs-string">&quot;text&quot;</span>: <span class="hljs-string">&quot;What do you see in this video?&quot;</span>},
],
},
]
processed_chat = processor.apply_chat_template(
messages,
add_generation_prompt=<span class="hljs-literal">True</span>,
tokenize=<span class="hljs-literal">True</span>,
return_dict=<span class="hljs-literal">True</span>,
return_tensors=<span class="hljs-string">&quot;pt&quot;</span>,
num_frames=<span class="hljs-number">32</span>,
video_load_backend=<span class="hljs-string">&quot;decord&quot;</span>,
)
<span class="hljs-built_in">print</span>(processed_chat.keys())<!-- HTML_TAG_END --></pre></div> <h3 class="relative group"><a id="sampling-with-fps" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#sampling-with-fps"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Sampling with FPS</span></h3> <p data-svelte-h="svelte-17386uu">When working with long videos, you might want to sample more frames for better representation. Instead of a fixed number of frames, you can specify <code>video_fps</code>, which determines how many frames per second to extract. For example, if a video is <strong>10 seconds long</strong> and you set <code>video_fps=2</code>, the model will sample <strong>20 frames</strong> (2 per second, uniformly spaced).</p> <p data-svelte-h="svelte-1uvxkpu">Using the above model, we need to apply chat template as follows to sample 2 frames per second.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->processed_chat = processor.apply_chat_template(
messages,
add_generation_prompt=<span class="hljs-literal">True</span>,
tokenize=<span class="hljs-literal">True</span>,
return_dict=<span class="hljs-literal">True</span>,
video_fps=<span class="hljs-number">32</span>,
video_load_backend=<span class="hljs-string">&quot;decord&quot;</span>,
)
<span class="hljs-built_in">print</span>(processed_chat.keys())<!-- HTML_TAG_END --></pre></div> <h3 class="relative group"><a id="custom-frame-sampling-with-a-function" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#custom-frame-sampling-with-a-function"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Custom Frame Sampling with a Function</span></h3> <p data-svelte-h="svelte-10f7ad4">Not all models sample frames <strong>uniformly</strong> — some require more complex logic to determine which frames to use. If your model follows a different sampling strategy, you can <strong>customize</strong> frame selection by providing a function:</p> <p data-svelte-h="svelte-15ee1kk">🔹 Use the <code>sample_indices_fn</code> argument to pass a <strong>callable function</strong> for sampling.<br>
🔹 If provided, this function <strong>overrides</strong> standard <code>num_frames</code> and <code>fps</code> methods.<br>
🔹 It receives all the arguments passed to <code>load_video</code> and must return <strong>valid frame indices</strong> to sample.</p> <p data-svelte-h="svelte-1id9cat">You should use <code>sample_indices_fn</code> when:</p> <ul data-svelte-h="svelte-1i1tmw4"><li>If you need a custom sampling strategy (e.g., <strong>adaptive frame selection</strong> instead of uniform sampling).</li> <li>If your model prioritizes <strong>key moments</strong> in a video rather than evenly spaced frames.</li></ul> <p data-svelte-h="svelte-ntx8uj">Here’s an example of how to implement it:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->
<span class="hljs-keyword">def</span> <span class="hljs-title function_">sample_indices_fn</span>(<span class="hljs-params">metadata, **kwargs</span>):
<span class="hljs-comment"># samples only the first and the second frame</span>
<span class="hljs-keyword">return</span> [<span class="hljs-number">0</span>, <span class="hljs-number">1</span>]
processed_chat = processor.apply_chat_template(
messages,
add_generation_prompt=<span class="hljs-literal">True</span>,
tokenize=<span class="hljs-literal">True</span>,
return_dict=<span class="hljs-literal">True</span>,
sample_indices_fn=sample_indices_fn,
video_load_backend=<span class="hljs-string">&quot;decord&quot;</span>,
)
<span class="hljs-built_in">print</span>(processed_chat.keys())<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-cbn9yp">By using <code>sample_indices_fn</code>, you gain <strong>full control</strong> over frame selection, making your model <strong>more adaptable</strong> to different video scenarios. 🚀</p> <h3 class="relative group"><a id="list-of-image-frames-as-video" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#list-of-image-frames-as-video"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>List of image frames as video</span></h3> <p data-svelte-h="svelte-trfw6a">Sometimes, instead of having a full video file, you might only have a set of sampled frames stored as images.</p> <p data-svelte-h="svelte-1i91am4">You can pass a list of image file paths, and the processor will automatically concatenate them into a video. Just make sure that all images have the same size, as they are assumed to be from the same video.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->frames_paths = [<span class="hljs-string">&quot;/path/to/frame0.png&quot;</span>, <span class="hljs-string">&quot;/path/to/frame5.png&quot;</span>, <span class="hljs-string">&quot;/path/to/frame10.png&quot;</span>]
messages = [
{
<span class="hljs-string">&quot;role&quot;</span>: <span class="hljs-string">&quot;system&quot;</span>,
<span class="hljs-string">&quot;content&quot;</span>: [{<span class="hljs-string">&quot;type&quot;</span>: <span class="hljs-string">&quot;text&quot;</span>, <span class="hljs-string">&quot;text&quot;</span>: <span class="hljs-string">&quot;You are a friendly chatbot who always responds in the style of a pirate&quot;</span>}],
},
{
<span class="hljs-string">&quot;role&quot;</span>: <span class="hljs-string">&quot;user&quot;</span>,
<span class="hljs-string">&quot;content&quot;</span>: [
{<span class="hljs-string">&quot;type&quot;</span>: <span class="hljs-string">&quot;video&quot;</span>, <span class="hljs-string">&quot;path&quot;</span>: frames_paths},
{<span class="hljs-string">&quot;type&quot;</span>: <span class="hljs-string">&quot;text&quot;</span>, <span class="hljs-string">&quot;text&quot;</span>: <span class="hljs-string">&quot;What do you see in this video?&quot;</span>},
],
},
]
processed_chat = processor.apply_chat_template(
messages,
add_generation_prompt=<span class="hljs-literal">True</span>,
tokenize=<span class="hljs-literal">True</span>,
return_dict=<span class="hljs-literal">True</span>,
)
<span class="hljs-built_in">print</span>(processed_chat.keys())<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="multimodal-conversational-pipeline" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#multimodal-conversational-pipeline"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Multimodal conversational pipeline</span></h2> <p data-svelte-h="svelte-1pzpfjq"><a href="/docs/transformers/main/en/main_classes/pipelines#transformers.ImageTextToTextPipeline">ImageTextToTextPipeline</a> currently accepts images as inputs but we are planning to add support for video inputs in the future. The pipeline supports chat inputs in the same format as we have seen above. Apart from that, the pipeline will accept chats in OpenAI format. This format is supported exclusively within the pipeline to make inference easier and more accessible.</p> <p data-svelte-h="svelte-tgl7fl">Here is how the OpenAI conversation format looks:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->messages = [
{
<span class="hljs-string">&quot;role&quot;</span>: <span class="hljs-string">&quot;user&quot;</span>,
<span class="hljs-string">&quot;content&quot;</span>: [
{
<span class="hljs-string">&quot;type&quot;</span>: <span class="hljs-string">&quot;text&quot;</span>,
<span class="hljs-string">&quot;text&quot;</span>: <span class="hljs-string">&quot;What is in this image?&quot;</span>,
},
{
<span class="hljs-string">&quot;type&quot;</span>: <span class="hljs-string">&quot;image_url&quot;</span>,
<span class="hljs-string">&quot;image_url&quot;</span>: {<span class="hljs-string">&quot;url&quot;</span>: <span class="hljs-string">f&quot;http://images.cocodataset.org/val2017/000000039769.jpg&quot;</span>},
},
],
}
]<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="best-practices-for-multimodal-template-configuration" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#best-practices-for-multimodal-template-configuration"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Best Practices for Multimodal Template Configuration</span></h2> <p data-svelte-h="svelte-kr110e">To add a custom chat template for your multimodal LLM, simply create your template using <a href="https://jinja.palletsprojects.com/en/3.1.x/templates/" rel="nofollow">Jinja</a> and set it with <code>processor.chat_template</code>. If you’re new to writing chat templates or need some tips, check out our <a href="./chat_template_advanced">tutorial here</a> for helpful guidance.</p> <p data-svelte-h="svelte-11llr7p">In some cases, you may want your template to handle a <strong>list of content</strong> from multiple modalities, while still supporting a plain string for text-only inference. Here’s an example of how you can achieve that, using the <a href="https://huggingface.co/collections/meta-llama/metas-llama-32-multimodal-models-675bfd70e574a62dd0e4059b" rel="nofollow">Llama-Vision</a> chat template.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-template-tag">{% <span class="hljs-name"><span class="hljs-name">for</span></span> message <span class="hljs-keyword">in</span> messages %}</span><span class="language-xml">
</span><span class="hljs-template-tag">{% <span class="hljs-name"><span class="hljs-name">if</span></span> loop.index0 == 0 %}</span><span class="hljs-template-variable">{{ bos_token }}</span><span class="hljs-template-tag">{% <span class="hljs-name"><span class="hljs-name">endif</span></span> %}</span><span class="language-xml">
</span><span class="hljs-template-variable">{{ &#x27;&lt;|start_header_id|&gt;&#x27; + message[&#x27;role&#x27;] + &#x27;&lt;|end_header_id|&gt;\n\n&#x27; }}</span><span class="language-xml">
</span><span class="hljs-template-tag">{% <span class="hljs-name"><span class="hljs-name">if</span></span> message[&#x27;content&#x27;] is string %}</span><span class="language-xml">
</span><span class="hljs-template-variable">{{ message[&#x27;content&#x27;] }}</span><span class="language-xml">
</span><span class="hljs-template-tag">{% <span class="hljs-name"><span class="hljs-name">else</span></span> %}</span><span class="language-xml">
</span><span class="hljs-template-tag">{% <span class="hljs-name"><span class="hljs-name">for</span></span> content <span class="hljs-keyword">in</span> message[&#x27;content&#x27;] %}</span><span class="language-xml">
</span><span class="hljs-template-tag">{% <span class="hljs-name"><span class="hljs-name">if</span></span> content[&#x27;type&#x27;] == &#x27;image&#x27; %}</span><span class="language-xml">
</span><span class="hljs-template-variable">{{ &#x27;&lt;|image|&gt;&#x27; }}</span><span class="language-xml">
</span><span class="hljs-template-tag">{% <span class="hljs-name"><span class="hljs-name">elif</span></span> content[&#x27;type&#x27;] == &#x27;text&#x27; %}</span><span class="language-xml">
</span><span class="hljs-template-variable">{{ content[&#x27;text&#x27;] }}</span><span class="language-xml">
</span><span class="hljs-template-tag">{% <span class="hljs-name"><span class="hljs-name">endif</span></span> %}</span><span class="language-xml">
</span><span class="hljs-template-tag">{% <span class="hljs-name"><span class="hljs-name">endfor</span></span> %}</span><span class="language-xml">
</span><span class="hljs-template-tag">{% <span class="hljs-name"><span class="hljs-name">endif</span></span> %}</span><span class="language-xml">
</span><span class="hljs-template-variable">{{ &#x27;&lt;|eot_id|&gt;&#x27; }}</span><span class="language-xml">
</span><span class="hljs-template-tag">{% <span class="hljs-name"><span class="hljs-name">endfor</span></span> %}</span><span class="language-xml">
</span><span class="hljs-template-tag">{% <span class="hljs-name"><span class="hljs-name">if</span></span> add_generation_prompt %}</span><span class="hljs-template-variable">{{ &#x27;&lt;|start_header_id|&gt;assistant&lt;|end_header_id|&gt;\n\n&#x27; }}</span><span class="hljs-template-tag">{% <span class="hljs-name"><span class="hljs-name">endif</span></span> %}</span><!-- HTML_TAG_END --></pre></div> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/transformers/blob/main/docs/source/en/chat_template_multimodal.md" target="_blank"><span data-svelte-h="svelte-1kd6by1">&lt;</span> <span data-svelte-h="svelte-x0xyl0">&gt;</span> <span data-svelte-h="svelte-1dajgef"><span class="underline ml-1.5">Update</span> on GitHub</span></a> <p></p>
<script>
{
__sveltekit_1yq8tgy = {
assets: "/docs/transformers/main/en",
base: "/docs/transformers/main/en",
env: {}
};
const element = document.currentScript.parentElement;
const data = [null,null];
Promise.all([
import("/docs/transformers/main/en/_app/immutable/entry/start.8cba93b8.js"),
import("/docs/transformers/main/en/_app/immutable/entry/app.00a1188c.js")
]).then(([kit, app]) => {
kit.start(app, element, {
node_ids: [0, 13],
data,
form: null,
error: null
});
});
}
</script>

Xet Storage Details

Size:
50.2 kB
·
Xet hash:
bdb23fad4d0d93a8e04a689f3637b511c89519b6ee45d6fb7f8e154adc7ae139

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.