Buckets:

rtrm's picture
download
raw
70.6 kB
<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Image tasks with IDEFICS&quot;,&quot;local&quot;:&quot;image-tasks-with-idefics&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Loading the model&quot;,&quot;local&quot;:&quot;loading-the-model&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Quantized model&quot;,&quot;local&quot;:&quot;quantized-model&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Image captioning&quot;,&quot;local&quot;:&quot;image-captioning&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Prompted image captioning&quot;,&quot;local&quot;:&quot;prompted-image-captioning&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Few-shot prompting&quot;,&quot;local&quot;:&quot;few-shot-prompting&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Visual question answering&quot;,&quot;local&quot;:&quot;visual-question-answering&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Image classification&quot;,&quot;local&quot;:&quot;image-classification&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Image-guided text generation&quot;,&quot;local&quot;:&quot;image-guided-text-generation&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Running inference in batch mode&quot;,&quot;local&quot;:&quot;running-inference-in-batch-mode&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;IDEFICS instruct for conversational use&quot;,&quot;local&quot;:&quot;idefics-instruct-for-conversational-use&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}">
<link href="/docs/transformers/main/ja/_app/immutable/assets/0.e3b0c442.css" rel="stylesheet">
<link rel="modulepreload" href="/docs/transformers/main/ja/_app/immutable/entry/start.1486e459.js">
<link rel="modulepreload" href="/docs/transformers/main/ja/_app/immutable/chunks/scheduler.9bc65507.js">
<link rel="modulepreload" href="/docs/transformers/main/ja/_app/immutable/chunks/singletons.eee55cbf.js">
<link rel="modulepreload" href="/docs/transformers/main/ja/_app/immutable/chunks/index.3b203c72.js">
<link rel="modulepreload" href="/docs/transformers/main/ja/_app/immutable/chunks/paths.59da1547.js">
<link rel="modulepreload" href="/docs/transformers/main/ja/_app/immutable/entry/app.d9ae818f.js">
<link rel="modulepreload" href="/docs/transformers/main/ja/_app/immutable/chunks/index.707bf1b6.js">
<link rel="modulepreload" href="/docs/transformers/main/ja/_app/immutable/nodes/0.c06aa070.js">
<link rel="modulepreload" href="/docs/transformers/main/ja/_app/immutable/chunks/each.e59479a4.js">
<link rel="modulepreload" href="/docs/transformers/main/ja/_app/immutable/nodes/138.6e6982e6.js">
<link rel="modulepreload" href="/docs/transformers/main/ja/_app/immutable/chunks/Tip.c2ecdbf4.js">
<link rel="modulepreload" href="/docs/transformers/main/ja/_app/immutable/chunks/CodeBlock.54a9f38d.js">
<link rel="modulepreload" href="/docs/transformers/main/ja/_app/immutable/chunks/DocNotebookDropdown.41f65cb5.js">
<link rel="modulepreload" href="/docs/transformers/main/ja/_app/immutable/chunks/globals.7f7f1b26.js">
<link rel="modulepreload" href="/docs/transformers/main/ja/_app/immutable/chunks/EditOnGithub.922df6ba.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Image tasks with IDEFICS&quot;,&quot;local&quot;:&quot;image-tasks-with-idefics&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Loading the model&quot;,&quot;local&quot;:&quot;loading-the-model&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Quantized model&quot;,&quot;local&quot;:&quot;quantized-model&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Image captioning&quot;,&quot;local&quot;:&quot;image-captioning&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Prompted image captioning&quot;,&quot;local&quot;:&quot;prompted-image-captioning&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Few-shot prompting&quot;,&quot;local&quot;:&quot;few-shot-prompting&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Visual question answering&quot;,&quot;local&quot;:&quot;visual-question-answering&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Image classification&quot;,&quot;local&quot;:&quot;image-classification&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Image-guided text generation&quot;,&quot;local&quot;:&quot;image-guided-text-generation&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Running inference in batch mode&quot;,&quot;local&quot;:&quot;running-inference-in-batch-mode&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;IDEFICS instruct for conversational use&quot;,&quot;local&quot;:&quot;idefics-instruct-for-conversational-use&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <h1 class="relative group"><a id="image-tasks-with-idefics" class="header-link 
block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#image-tasks-with-idefics"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Image tasks with IDEFICS</span></h1> <div class="flex space-x-1 absolute z-10 right-0 top-0"> <div class="relative colab-dropdown "> <button class=" " type="button"> <img alt="Open In Colab" class="!m-0" src="https://colab.research.google.com/assets/colab-badge.svg"> </button> </div> <div class="relative colab-dropdown "> <button class=" " type="button"> <img alt="Open In Studio Lab" class="!m-0" src="https://studiolab.sagemaker.aws/studiolab.svg"> </button> </div></div> <p data-svelte-h="svelte-idfzro">個別のタスクは特殊なモデルを微調整することで対処できますが、別のアプローチも可能です。
最近登場して人気を博しているのは、微調整を行わずにさまざまなタスクに大規模なモデルを使用することです。
たとえば、大規模な言語モデルは、要約、翻訳、分類などの NLP タスクを処理できます。
このアプローチは、テキストなどの単一のモダリティに限定されなくなりました。このガイドでは、
IDEFICS と呼ばれる大規模なマルチモーダル モデルを使用して、画像とテキストのタスクを解決する方法を説明します。</p> <p data-svelte-h="svelte-48v8i5"><a href="../model_doc/idefics">IDEFICS</a> は、<a href="https://huggingface.co/papers/2204.14198" rel="nofollow">Flamingo</a> に基づくオープンアクセスのビジョンおよび言語モデルです。
Flamingo は、DeepMind によって最初に開発された最先端の視覚言語モデルです。IDEFICS は、画像とテキストの任意のシーケンスを
入力として受け入れ、出力として一貫したテキストを生成します。画像に関する質問に答えたり、視覚的なコンテンツについて説明したり、
複数の画像に基づいたストーリーを作成したりできます。 IDEFICS には 2 つのバリエーションがあります - <a href="https://huggingface.co/HuggingFaceM4/idefics-80b" rel="nofollow">800 億パラメータ</a>
および <a href="https://huggingface.co/HuggingFaceM4/idefics-9b" rel="nofollow">90 億のパラメータ</a>、どちらも 🤗 Hub で入手できます。各バリエーションについて、会話のユースケースに適応した、
指示チューニング済み (instruct) バージョンのモデルも見つけることができます。</p> <p data-svelte-h="svelte-1nff2jp">このモデルは非常に多用途で、幅広い画像タスクやマルチモーダル タスクに使用できます。しかし、
大規模なモデルであるということは、大量の計算リソースとインフラストラクチャが必要であることを意味します。このアプローチが、
個別のタスクごとに特化したモデルを微調整するよりもユースケースに適しているかどうかは、あなた自身で判断してください。</p> <p data-svelte-h="svelte-sw406c">このガイドでは、次の方法を学習します。</p> <ul data-svelte-h="svelte-s4nl0h"><li><a href="#loading-the-model">IDEFICS をロード</a> および <a href="#quantized-model">モデルの量子化バージョンをロード</a></li> <li>IDEFICS を次の目的で使用します。<ul><li><a href="#image-captioning">画像キャプション</a></li> <li><a href="#prompted-image-captioning">プロンプト画像キャプション</a></li> <li><a href="#few-shot-prompting">Few-shot プロンプト</a></li> <li><a href="#visual-question-answering">ビジュアル質問回答</a></li> <li><a href="#image-classification">画像分類</a></li> <li><a href="#image-guided-text-generation">画像ガイド付きテキスト生成</a></li></ul></li> <li><a href="#running-inference-in-batch-mode">バッチモードで推論を実行する</a></li> <li><a href="#idefics-instruct-for-conversational-use">会話用に IDEFICS instruct を使用する</a></li></ul> <p data-svelte-h="svelte-1lya3k8">始める前に、必要なライブラリがすべてインストールされていることを確認してください。</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: 
transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->pip install -q bitsandbytes sentencepiece accelerate transformers<!-- HTML_TAG_END --></pre></div> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400">量子化されていないバージョンのモデル チェックポイントを使用して次の例を実行するには、少なくとも 20GB の GPU メモリが必要です。</div> <h2 class="relative group"><a id="loading-the-model" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#loading-the-model"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Loading the model</span></h2> <p data-svelte-h="svelte-1uo2yt4">まずはモデルの 90 億個のパラメーターのチェックポイントをロードしましょう。</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" 
fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span>checkpoint = <span class="hljs-string">&quot;HuggingFaceM4/idefics-9b&quot;</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-15svvjb">他の Transformers モデルと同様に、プロセッサとモデル自体をチェックポイントからロードする必要があります。
IDEFICS プロセッサは、<code>LlamaTokenizer</code> と IDEFICS 画像プロセッサを単一のプロセッサにラップし、
モデルのテキストと画像の入力を準備します。</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">import</span> torch
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> IdeficsForVisionText2Text, AutoProcessor
<span class="hljs-meta">&gt;&gt;&gt; </span>processor = AutoProcessor.from_pretrained(checkpoint)
<span class="hljs-meta">&gt;&gt;&gt; </span>model = IdeficsForVisionText2Text.from_pretrained(checkpoint, torch_dtype=torch.bfloat16, device_map=<span class="hljs-string">&quot;auto&quot;</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-8hdssd"><code>device_map</code> を <code>&quot;auto&quot;</code> に設定すると、モデルの重みを最も最適化された状態でロードおよび保存する方法が自動的に決定されます。
既存のデバイスを考慮した方法。</p> <h3 class="relative group"><a id="quantized-model" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#quantized-model"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Quantized model</span></h3> <p data-svelte-h="svelte-143oe1m">ハイメモリ GPU の可用性が問題となる場合は、モデルの量子化されたバージョンをロードできます。モデルと
プロセッサを 4 ビット精度でロードするには、<code>BitsAndBytesConfig</code> を <code>from_pretrained</code> メソッドに渡します。これにより、モデルが圧縮されます。
ロード中にその場で。</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">import</span> torch
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> IdeficsForVisionText2Text, AutoProcessor, BitsAndBytesConfig
<span class="hljs-meta">&gt;&gt;&gt; </span>quantization_config = BitsAndBytesConfig(
<span class="hljs-meta">... </span> load_in_4bit=<span class="hljs-literal">True</span>,
<span class="hljs-meta">... </span> bnb_4bit_compute_dtype=torch.float16,
<span class="hljs-meta">... </span>)
<span class="hljs-meta">&gt;&gt;&gt; </span>processor = AutoProcessor.from_pretrained(checkpoint)
<span class="hljs-meta">&gt;&gt;&gt; </span>model = IdeficsForVisionText2Text.from_pretrained(
<span class="hljs-meta">... </span> checkpoint,
<span class="hljs-meta">... </span> quantization_config=quantization_config,
<span class="hljs-meta">... </span> device_map=<span class="hljs-string">&quot;auto&quot;</span>
<span class="hljs-meta">... </span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1553duw">提案された方法のいずれかでモデルをロードしたので、IDEFICS を使用できるタスクの探索に進みましょう。</p> <h2 class="relative group"><a id="image-captioning" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#image-captioning"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Image captioning</span></h2> <p data-svelte-h="svelte-1313v8o">画像のキャプション付けは、特定の画像のキャプションを予測するタスクです。一般的な用途は視覚障害者を支援することです
人々はさまざまな状況をナビゲートします。たとえば、オンラインで画像コンテンツを探索します。</p> <p data-svelte-h="svelte-5ecawm">タスクを説明するには、キャプションを付ける画像を取得します。例:</p> <div class="flex justify-center" data-svelte-h="svelte-t8y7db"><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/idefics-im-captioning.jpg" alt="Image of a puppy in a flower bed"></div> <p data-svelte-h="svelte-1ilja47">写真提供:<a href="https://unsplash.com/@hendoo" rel="nofollow">Hendo Wang</a></p> <p data-svelte-h="svelte-1j57a9">IDEFICS はテキストと画像のプロンプトを受け入れます。ただし、画像にキャプションを付けるには、テキスト プロンプトをモデルに提供する必要はなく、
前処理された入力画像のみを渡します。テキスト プロンプトがない場合、モデルはテキストの生成を開始します。
BOS (Beginning-of-sequence) トークンによりキャプションが作成されます。</p> <p data-svelte-h="svelte-7tpjl6">モデルへの画像入力として、画像オブジェクト (<code>PIL.Image</code>) または画像を取得できる URL のいずれかを使用できます。</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span>prompt = [
<span class="hljs-meta">... </span> <span class="hljs-string">&quot;https://images.unsplash.com/photo-1583160247711-2191776b4b91?ixlib=rb-4.0.3&amp;ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&amp;auto=format&amp;fit=crop&amp;w=3542&amp;q=80&quot;</span>,
<span class="hljs-meta">... </span>]
<span class="hljs-meta">&gt;&gt;&gt; </span>inputs = processor(prompt, return_tensors=<span class="hljs-string">&quot;pt&quot;</span>).to(<span class="hljs-string">&quot;cuda&quot;</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span>bad_words_ids = processor.tokenizer([<span class="hljs-string">&quot;&lt;image&gt;&quot;</span>, <span class="hljs-string">&quot;&lt;fake_token_around_image&gt;&quot;</span>], add_special_tokens=<span class="hljs-literal">False</span>).input_ids
<span class="hljs-meta">&gt;&gt;&gt; </span>generated_ids = model.generate(**inputs, max_new_tokens=<span class="hljs-number">10</span>, bad_words_ids=bad_words_ids)
<span class="hljs-meta">&gt;&gt;&gt; </span>generated_text = processor.batch_decode(generated_ids, skip_special_tokens=<span class="hljs-literal">True</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-built_in">print</span>(generated_text[<span class="hljs-number">0</span>])
A puppy <span class="hljs-keyword">in</span> a flower bed<!-- HTML_TAG_END --></pre></div> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-9rnfb5"><code>max_new_tokens</code> を増やしたときに発生するエラーを避けるために、<code>generate</code> の呼び出しに <code>bad_words_ids</code> を含めることをお勧めします。
これを指定しないと、モデルによって画像が生成されていないにもかかわらず、
モデルが新しい <code>&lt;image&gt;</code> または <code>&lt;fake_token_around_image&gt;</code> トークンを生成しようとします。
このガイドのようにオンザフライで設定することも、<a href="../generation_strategies">テキスト生成戦略</a> ガイドで説明されているように <code>GenerationConfig</code> に保存することもできます。</p></div> <h2 class="relative group"><a id="prompted-image-captioning" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#prompted-image-captioning"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Prompted image captioning</span></h2> <p data-svelte-h="svelte-1qmjkd7">テキスト プロンプトを提供することで画像キャプションを拡張でき、モデルは画像を指定して続行します。持っていきましょう
別の図で説明します。</p> <div class="flex justify-center" data-svelte-h="svelte-1ritb1k"><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/idefics-prompted-im-captioning.jpg" alt="Image of the Eiffel Tower at night"></div> <p data-svelte-h="svelte-1dav10a">写真提供:<a href="https://unsplash.com/@dnevozhai" rel="nofollow">Denys Nevozhai</a></p> <p data-svelte-h="svelte-1wtxpzx">テキストおよび画像のプロンプトを単一のリストとしてモデルのプロセッサに渡し、適切な入力を作成できます。</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span>prompt = [
<span class="hljs-meta">... </span> <span class="hljs-string">&quot;https://images.unsplash.com/photo-1543349689-9a4d426bee8e?ixlib=rb-4.0.3&amp;ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&amp;auto=format&amp;fit=crop&amp;w=3501&amp;q=80&quot;</span>,
<span class="hljs-meta">... </span> <span class="hljs-string">&quot;This is an image of &quot;</span>,
<span class="hljs-meta">... </span>]
<span class="hljs-meta">&gt;&gt;&gt; </span>inputs = processor(prompt, return_tensors=<span class="hljs-string">&quot;pt&quot;</span>).to(<span class="hljs-string">&quot;cuda&quot;</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span>bad_words_ids = processor.tokenizer([<span class="hljs-string">&quot;&lt;image&gt;&quot;</span>, <span class="hljs-string">&quot;&lt;fake_token_around_image&gt;&quot;</span>], add_special_tokens=<span class="hljs-literal">False</span>).input_ids
<span class="hljs-meta">&gt;&gt;&gt; </span>generated_ids = model.generate(**inputs, max_new_tokens=<span class="hljs-number">10</span>, bad_words_ids=bad_words_ids)
<span class="hljs-meta">&gt;&gt;&gt; </span>generated_text = processor.batch_decode(generated_ids, skip_special_tokens=<span class="hljs-literal">True</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-built_in">print</span>(generated_text[<span class="hljs-number">0</span>])
This <span class="hljs-keyword">is</span> an image of the Eiffel Tower <span class="hljs-keyword">in</span> Paris, France.<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="few-shot-prompting" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#few-shot-prompting"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Few-shot prompting</span></h2> <p data-svelte-h="svelte-1q30vzg">IDEFICS はゼロショットで優れた結果を示しますが、タスクによっては特定の形式のキャプションが必要になる場合や、キャプションが付属する場合があります。
また、タスクの複雑さを増大させるその他の制限や要件が伴う場合もあります。Few-shot プロンプトを使用すると、コンテキスト内学習を有効にすることができます。
プロンプトに例を指定することで、指定された例の形式を模倣した結果を生成するようにモデルを操作できます。</p> <p data-svelte-h="svelte-5u2x5j">前のエッフェル塔の画像をモデルの例として使用し、
画像内のオブジェクトが何であるかを知ることに加えて、それに関する興味深い情報も取得したいということをモデルに示すプロンプトを作成してみましょう。
次に、自由の女神の画像に対して同じ応答形式を取得できるかどうかを見てみましょう。</p> <div class="flex justify-center" data-svelte-h="svelte-gin1vp"><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/idefics-few-shot.jpg" alt="Image of the Statue of Liberty"></div> <p data-svelte-h="svelte-fndx8">写真提供:<a href="https://unsplash.com/@jmayobres" rel="nofollow">Juan Mayobre</a></p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span>prompt = [<span class="hljs-string">&quot;User:&quot;</span>,
<span class="hljs-meta">... </span> <span class="hljs-string">&quot;https://images.unsplash.com/photo-1543349689-9a4d426bee8e?ixlib=rb-4.0.3&amp;ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&amp;auto=format&amp;fit=crop&amp;w=3501&amp;q=80&quot;</span>,
<span class="hljs-meta">... </span> <span class="hljs-string">&quot;Describe this image.\nAssistant: An image of the Eiffel Tower at night. Fun fact: the Eiffel Tower is the same height as an 81-storey building.\n&quot;</span>,
<span class="hljs-meta">... </span> <span class="hljs-string">&quot;User:&quot;</span>,
<span class="hljs-meta">... </span> <span class="hljs-string">&quot;https://images.unsplash.com/photo-1524099163253-32b7f0256868?ixlib=rb-4.0.3&amp;ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&amp;auto=format&amp;fit=crop&amp;w=3387&amp;q=80&quot;</span>,
<span class="hljs-meta">... </span> <span class="hljs-string">&quot;Describe this image.\nAssistant:&quot;</span>
<span class="hljs-meta">... </span> ]
<span class="hljs-meta">&gt;&gt;&gt; </span>inputs = processor(prompt, return_tensors=<span class="hljs-string">&quot;pt&quot;</span>).to(<span class="hljs-string">&quot;cuda&quot;</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span>bad_words_ids = processor.tokenizer([<span class="hljs-string">&quot;&lt;image&gt;&quot;</span>, <span class="hljs-string">&quot;&lt;fake_token_around_image&gt;&quot;</span>], add_special_tokens=<span class="hljs-literal">False</span>).input_ids
<span class="hljs-meta">&gt;&gt;&gt; </span>generated_ids = model.generate(**inputs, max_new_tokens=<span class="hljs-number">30</span>, bad_words_ids=bad_words_ids)
<span class="hljs-meta">&gt;&gt;&gt; </span>generated_text = processor.batch_decode(generated_ids, skip_special_tokens=<span class="hljs-literal">True</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-built_in">print</span>(generated_text[<span class="hljs-number">0</span>])
User: Describe this image.
Assistant: An image of the Eiffel Tower at night. Fun fact: the Eiffel Tower <span class="hljs-keyword">is</span> the same height <span class="hljs-keyword">as</span> an <span class="hljs-number">81</span>-storey building.
User: Describe this image.
Assistant: An image of the Statue of Liberty. Fun fact: the Statue of Liberty <span class="hljs-keyword">is</span> <span class="hljs-number">151</span> feet tall.<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1rfr59p">モデルは 1 つの例 (つまり、1 ショット) だけからタスクの実行方法を学習していることに注目してください。より複雑なタスクの場合は、
より多くの例 (3 ショット、5 ショットなど) を自由に試してみてください。</p> <h2 class="relative group"><a id="visual-question-answering" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#visual-question-answering"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Visual question answering</span></h2> <p data-svelte-h="svelte-1fvayaj">Visual Question Answering (VQA) は、画像に基づいて自由形式の質問に答えるタスクです。画像キャプションと同様に、
アクセシビリティ アプリケーションや教育 (視覚資料についての推論) に使用できます。ほかにも、カスタマー
サービス(画像を基にした商品に関する質問)、画像検索など。</p> <p data-svelte-h="svelte-tndyc2">このタスク用に新しい画像を取得しましょう。</p> <div class="flex justify-center" data-svelte-h="svelte-1j2xr8e"><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/idefics-vqa.jpg" alt="Image of a couple having a picnic"></div> <p data-svelte-h="svelte-7xhi3r">写真提供 <a href="https://unsplash.com/@jarritos" rel="nofollow">Jarritos Mexican Soda</a>.</p> <p data-svelte-h="svelte-jun1c7">適切な指示をプロンプトすることで、モデルを画像キャプションから視覚的な質問への応答に導くことができます。</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span>prompt = [
<span class="hljs-meta">... </span> <span class="hljs-string">&quot;Instruction: Provide an answer to the question. Use the image to answer.\n&quot;</span>,
<span class="hljs-meta">... </span> <span class="hljs-string">&quot;https://images.unsplash.com/photo-1623944889288-cd147dbb517c?ixlib=rb-4.0.3&amp;ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&amp;auto=format&amp;fit=crop&amp;w=3540&amp;q=80&quot;</span>,
<span class="hljs-meta">... </span> <span class="hljs-string">&quot;Question: Where are these people and what&#x27;s the weather like? Answer:&quot;</span>
<span class="hljs-meta">... </span>]
<span class="hljs-meta">&gt;&gt;&gt; </span>inputs = processor(prompt, return_tensors=<span class="hljs-string">&quot;pt&quot;</span>).to(<span class="hljs-string">&quot;cuda&quot;</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span>bad_words_ids = processor.tokenizer([<span class="hljs-string">&quot;&lt;image&gt;&quot;</span>, <span class="hljs-string">&quot;&lt;fake_token_around_image&gt;&quot;</span>], add_special_tokens=<span class="hljs-literal">False</span>).input_ids
<span class="hljs-meta">&gt;&gt;&gt; </span>generated_ids = model.generate(**inputs, max_new_tokens=<span class="hljs-number">20</span>, bad_words_ids=bad_words_ids)
<span class="hljs-meta">&gt;&gt;&gt; </span>generated_text = processor.batch_decode(generated_ids, skip_special_tokens=<span class="hljs-literal">True</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-built_in">print</span>(generated_text[<span class="hljs-number">0</span>])
Instruction: Provide an answer to the question. Use the image to answer.
Question: Where are these people <span class="hljs-keyword">and</span> what<span class="hljs-string">&#x27;s the weather like? Answer: They&#x27;</span>re <span class="hljs-keyword">in</span> a park <span class="hljs-keyword">in</span> New York City, <span class="hljs-keyword">and</span> it<span class="hljs-string">&#x27;s a beautiful day.</span><!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="image-classification" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#image-classification"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Image classification</span></h2> <p data-svelte-h="svelte-9bgfkp">IDEFICS は、対象となるカテゴリのラベル付きサンプルを含むデータで明示的にトレーニングされていなくても、
画像をさまざまなカテゴリに分類できます。カテゴリのリストを指定し、画像とテキストを理解する
機能を利用すると、モデルは画像がどのカテゴリに属する​​可能性が高いかを推測できます。</p> <p data-svelte-h="svelte-9c3uvw">たとえば、次のような野菜スタンドの画像があるとします。</p> <div class="flex justify-center" data-svelte-h="svelte-g02ga3"><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/idefics-classification.jpg" alt="Image of a vegetable stand"></div> <p data-svelte-h="svelte-1fktjb1">写真提供:<a href="https://unsplash.com/@peterwendt" rel="nofollow">Peter Wendt</a></p> <p data-svelte-h="svelte-bpjvv6">画像を次のいずれかのカテゴリに分類するようにモデルに指示できます。</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span>categories = [<span class="hljs-string">&#x27;animals&#x27;</span>,<span class="hljs-string">&#x27;vegetables&#x27;</span>, <span class="hljs-string">&#x27;city landscape&#x27;</span>, <span 
class="hljs-string">&#x27;cars&#x27;</span>, <span class="hljs-string">&#x27;office&#x27;</span>]
<span class="hljs-meta">&gt;&gt;&gt; </span>prompt = [<span class="hljs-string">f&quot;Instruction: Classify the following image into a single category from the following list: <span class="hljs-subst">{categories}</span>.\n&quot;</span>,
<span class="hljs-meta">... </span> <span class="hljs-string">&quot;https://images.unsplash.com/photo-1471193945509-9ad0617afabf?ixlib=rb-4.0.3&amp;ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&amp;auto=format&amp;fit=crop&amp;w=3540&amp;q=80&quot;</span>,
<span class="hljs-meta">... </span> <span class="hljs-string">&quot;Category: &quot;</span>
<span class="hljs-meta">... </span>]
<span class="hljs-meta">&gt;&gt;&gt; </span>inputs = processor(prompt, return_tensors=<span class="hljs-string">&quot;pt&quot;</span>).to(<span class="hljs-string">&quot;cuda&quot;</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span>bad_words_ids = processor.tokenizer([<span class="hljs-string">&quot;&lt;image&gt;&quot;</span>, <span class="hljs-string">&quot;&lt;fake_token_around_image&gt;&quot;</span>], add_special_tokens=<span class="hljs-literal">False</span>).input_ids
<span class="hljs-meta">&gt;&gt;&gt; </span>generated_ids = model.generate(**inputs, max_new_tokens=<span class="hljs-number">6</span>, bad_words_ids=bad_words_ids)
<span class="hljs-meta">&gt;&gt;&gt; </span>generated_text = processor.batch_decode(generated_ids, skip_special_tokens=<span class="hljs-literal">True</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-built_in">print</span>(generated_text[<span class="hljs-number">0</span>])
Instruction: Classify the following image into a single category <span class="hljs-keyword">from</span> the following <span class="hljs-built_in">list</span>: [<span class="hljs-string">&#x27;animals&#x27;</span>, <span class="hljs-string">&#x27;vegetables&#x27;</span>, <span class="hljs-string">&#x27;city landscape&#x27;</span>, <span class="hljs-string">&#x27;cars&#x27;</span>, <span class="hljs-string">&#x27;office&#x27;</span>].
Category: Vegetables<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-2873id">上の例では、画像を 1 つのカテゴリに分類するようにモデルに指示していますが、ランク分類を行うようにモデルに指示することもできます。</p> <h2 class="relative group"><a id="image-guided-text-generation" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#image-guided-text-generation"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Image-guided text generation</span></h2> <p data-svelte-h="svelte-iteb8q">よりクリエイティブなアプリケーションの場合は、画像ガイド付きテキスト生成を使用して、画像に基づいてテキストを生成できます。これは、
製品、広告、シーンの説明などを作成するのに役立ちます。</p> <p data-svelte-h="svelte-1blzzm4">IDEFICS に、赤いドアの単純な画像に基づいてストーリーを書くように促してみましょう。</p> <div class="flex justify-center" data-svelte-h="svelte-1mf93u3"><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/idefics-story-generation.jpg" alt="Image of a red door with a pumpkin on the steps"></div> <p data-svelte-h="svelte-4ctgu0">写真提供:<a href="https://unsplash.com/@devonshiremedia" rel="nofollow">Craig Tidball</a></p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span>prompt = [<span class="hljs-string">&quot;Instruction: Use the image to write a story. \n&quot;</span>,
<span class="hljs-meta">... </span> <span class="hljs-string">&quot;https://images.unsplash.com/photo-1517086822157-2b0358e7684a?ixlib=rb-4.0.3&amp;ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&amp;auto=format&amp;fit=crop&amp;w=2203&amp;q=80&quot;</span>,
<span class="hljs-meta">... </span> <span class="hljs-string">&quot;Story: \n&quot;</span>]
<span class="hljs-meta">&gt;&gt;&gt; </span>inputs = processor(prompt, return_tensors=<span class="hljs-string">&quot;pt&quot;</span>).to(<span class="hljs-string">&quot;cuda&quot;</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span>bad_words_ids = processor.tokenizer([<span class="hljs-string">&quot;&lt;image&gt;&quot;</span>, <span class="hljs-string">&quot;&lt;fake_token_around_image&gt;&quot;</span>], add_special_tokens=<span class="hljs-literal">False</span>).input_ids
<span class="hljs-meta">&gt;&gt;&gt; </span>generated_ids = model.generate(**inputs, num_beams=<span class="hljs-number">2</span>, max_new_tokens=<span class="hljs-number">200</span>, bad_words_ids=bad_words_ids)
<span class="hljs-meta">&gt;&gt;&gt; </span>generated_text = processor.batch_decode(generated_ids, skip_special_tokens=<span class="hljs-literal">True</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-built_in">print</span>(generated_text[<span class="hljs-number">0</span>])
Instruction: Use the image to write a story.
Story:
Once upon a time, there was a little girl who lived <span class="hljs-keyword">in</span> a house <span class="hljs-keyword">with</span> a red door. She loved her red door. It was the prettiest door <span class="hljs-keyword">in</span> the whole world.
One day, the little girl was playing <span class="hljs-keyword">in</span> her yard when she noticed a man standing on her doorstep. He was wearing a long black coat <span class="hljs-keyword">and</span> a top hat.
The little girl ran inside <span class="hljs-keyword">and</span> told her mother about the man.
Her mother said, “Don’t worry, honey. He’s just a friendly ghost.”
The little girl wasn’t sure <span class="hljs-keyword">if</span> she believed her mother, but she went outside anyway.
When she got to the door, the man was gone.
The <span class="hljs-built_in">next</span> day, the little girl was playing <span class="hljs-keyword">in</span> her yard again when she noticed the man standing on her doorstep.
He was wearing a long black coat <span class="hljs-keyword">and</span> a top hat.
The little girl ran<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1hxn4p8">IDEFICS は玄関先にあるカボチャに気づき、幽霊に関する不気味なハロウィーンの話をしたようです。</p> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-4b52t2">このような長い出力の場合、テキスト生成戦略を微調整すると大きなメリットが得られます。これにより、
生成される出力の品質を大幅に向上させることができます。詳細については、<a href="../generation_strategies">テキスト生成戦略</a> を
確認してください。</p></div> <h2 class="relative group"><a id="running-inference-in-batch-mode" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#running-inference-in-batch-mode"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Running inference in batch mode</span></h2> <p data-svelte-h="svelte-1x78e76">これまでのすべてのセクションでは、単一のサンプルに対する IDEFICS の使い方を説明しました。非常に似た方法で、サンプルのバッチに対しても推論を実行できます。
プロンプトのリストを渡すことにより、サンプルのバッチを取得します。</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span>prompts = [
<span class="hljs-meta">... </span> [ <span class="hljs-string">&quot;https://images.unsplash.com/photo-1543349689-9a4d426bee8e?ixlib=rb-4.0.3&amp;ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&amp;auto=format&amp;fit=crop&amp;w=3501&amp;q=80&quot;</span>,
<span class="hljs-meta">... </span> <span class="hljs-string">&quot;This is an image of &quot;</span>,
<span class="hljs-meta">... </span> ],
<span class="hljs-meta">... </span> [ <span class="hljs-string">&quot;https://images.unsplash.com/photo-1623944889288-cd147dbb517c?ixlib=rb-4.0.3&amp;ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&amp;auto=format&amp;fit=crop&amp;w=3540&amp;q=80&quot;</span>,
<span class="hljs-meta">... </span> <span class="hljs-string">&quot;This is an image of &quot;</span>,
<span class="hljs-meta">... </span> ],
<span class="hljs-meta">... </span> [ <span class="hljs-string">&quot;https://images.unsplash.com/photo-1471193945509-9ad0617afabf?ixlib=rb-4.0.3&amp;ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&amp;auto=format&amp;fit=crop&amp;w=3540&amp;q=80&quot;</span>,
<span class="hljs-meta">... </span> <span class="hljs-string">&quot;This is an image of &quot;</span>,
<span class="hljs-meta">... </span> ],
<span class="hljs-meta">... </span>]
<span class="hljs-meta">&gt;&gt;&gt; </span>inputs = processor(prompts, return_tensors=<span class="hljs-string">&quot;pt&quot;</span>).to(<span class="hljs-string">&quot;cuda&quot;</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span>bad_words_ids = processor.tokenizer([<span class="hljs-string">&quot;&lt;image&gt;&quot;</span>, <span class="hljs-string">&quot;&lt;fake_token_around_image&gt;&quot;</span>], add_special_tokens=<span class="hljs-literal">False</span>).input_ids
<span class="hljs-meta">&gt;&gt;&gt; </span>generated_ids = model.generate(**inputs, max_new_tokens=<span class="hljs-number">10</span>, bad_words_ids=bad_words_ids)
<span class="hljs-meta">&gt;&gt;&gt; </span>generated_text = processor.batch_decode(generated_ids, skip_special_tokens=<span class="hljs-literal">True</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">for</span> i,t <span class="hljs-keyword">in</span> <span class="hljs-built_in">enumerate</span>(generated_text):
<span class="hljs-meta">... </span> <span class="hljs-built_in">print</span>(<span class="hljs-string">f&quot;<span class="hljs-subst">{i}</span>:\n<span class="hljs-subst">{t}</span>\n&quot;</span>)
<span class="hljs-number">0</span>:
This <span class="hljs-keyword">is</span> an image of the Eiffel Tower <span class="hljs-keyword">in</span> Paris, France.
<span class="hljs-number">1</span>:
This <span class="hljs-keyword">is</span> an image of a couple on a picnic blanket.
<span class="hljs-number">2</span>:
This <span class="hljs-keyword">is</span> an image of a vegetable stand.<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="idefics-instruct-for-conversational-use" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#idefics-instruct-for-conversational-use"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>IDEFICS instruct for conversational use</span></h2> <p data-svelte-h="svelte-bb91xj">会話型のユースケースの場合は、🤗 ハブでモデルの微調整された指示されたバージョンを見つけることができます。
<code>HuggingFaceM4/idefics-80b-instruct</code> および <code>HuggingFaceM4/idefics-9b-instruct</code></p> <p data-svelte-h="svelte-2yij83">これらのチェックポイントは、それぞれの基本モデルを、教師あり微調整データセットと命令微調整データセットを組み合わせた
データで微調整した結果です。これにより、ダウンストリームのパフォーマンスを向上させながら、会話設定でモデルをより使いやすくします。</p> <p data-svelte-h="svelte-rc9g0e">会話での使用とプロンプトは、基本モデルの使用と非常に似ています。</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">import</span> torch
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> IdeficsForVisionText2Text, AutoProcessor
<span class="hljs-meta">&gt;&gt;&gt; </span>device = <span class="hljs-string">&quot;cuda&quot;</span> <span class="hljs-keyword">if</span> torch.cuda.is_available() <span class="hljs-keyword">else</span> <span class="hljs-string">&quot;cpu&quot;</span>
<span class="hljs-meta">&gt;&gt;&gt; </span>checkpoint = <span class="hljs-string">&quot;HuggingFaceM4/idefics-9b-instruct&quot;</span>
<span class="hljs-meta">&gt;&gt;&gt; </span>model = IdeficsForVisionText2Text.from_pretrained(checkpoint, torch_dtype=torch.bfloat16).to(device)
<span class="hljs-meta">&gt;&gt;&gt; </span>processor = AutoProcessor.from_pretrained(checkpoint)
<span class="hljs-meta">&gt;&gt;&gt; </span>prompts = [
<span class="hljs-meta">... </span> [
<span class="hljs-meta">... </span> <span class="hljs-string">&quot;User: What is in this image?&quot;</span>,
<span class="hljs-meta">... </span> <span class="hljs-string">&quot;https://upload.wikimedia.org/wikipedia/commons/8/86/Id%C3%A9fix.JPG&quot;</span>,
<span class="hljs-meta">... </span> <span class="hljs-string">&quot;&lt;end_of_utterance&gt;&quot;</span>,
<span class="hljs-meta">... </span> <span class="hljs-string">&quot;\nAssistant: This picture depicts Idefix, the dog of Obelix in Asterix and Obelix. Idefix is running on the ground.&lt;end_of_utterance&gt;&quot;</span>,
<span class="hljs-meta">... </span> <span class="hljs-string">&quot;\nUser:&quot;</span>,
<span class="hljs-meta">... </span> <span class="hljs-string">&quot;https://static.wikia.nocookie.net/asterix/images/2/25/R22b.gif/revision/latest?cb=20110815073052&quot;</span>,
<span class="hljs-meta">... </span> <span class="hljs-string">&quot;And who is that?&lt;end_of_utterance&gt;&quot;</span>,
<span class="hljs-meta">... </span> <span class="hljs-string">&quot;\nAssistant:&quot;</span>,
<span class="hljs-meta">... </span> ],
<span class="hljs-meta">... </span>]
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-comment"># --batched mode</span>
<span class="hljs-meta">&gt;&gt;&gt; </span>inputs = processor(prompts, add_end_of_utterance_token=<span class="hljs-literal">False</span>, return_tensors=<span class="hljs-string">&quot;pt&quot;</span>).to(device)
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-comment"># --single sample mode</span>
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-comment"># inputs = processor(prompts[0], return_tensors=&quot;pt&quot;).to(device)</span>
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-comment"># Generation args</span>
<span class="hljs-meta">&gt;&gt;&gt; </span>exit_condition = processor.tokenizer(<span class="hljs-string">&quot;&lt;end_of_utterance&gt;&quot;</span>, add_special_tokens=<span class="hljs-literal">False</span>).input_ids
<span class="hljs-meta">&gt;&gt;&gt; </span>bad_words_ids = processor.tokenizer([<span class="hljs-string">&quot;&lt;image&gt;&quot;</span>, <span class="hljs-string">&quot;&lt;fake_token_around_image&gt;&quot;</span>], add_special_tokens=<span class="hljs-literal">False</span>).input_ids
<span class="hljs-meta">&gt;&gt;&gt; </span>generated_ids = model.generate(**inputs, eos_token_id=exit_condition, bad_words_ids=bad_words_ids, max_length=<span class="hljs-number">100</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span>generated_text = processor.batch_decode(generated_ids, skip_special_tokens=<span class="hljs-literal">True</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">for</span> i, t <span class="hljs-keyword">in</span> <span class="hljs-built_in">enumerate</span>(generated_text):
<span class="hljs-meta">... </span> <span class="hljs-built_in">print</span>(<span class="hljs-string">f&quot;<span class="hljs-subst">{i}</span>:\n<span class="hljs-subst">{t}</span>\n&quot;</span>)<!-- HTML_TAG_END --></pre></div> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/transformers/blob/main/docs/source/ja/tasks/idefics.md" target="_blank"><span data-svelte-h="svelte-1kd6by1">&lt;</span> <span data-svelte-h="svelte-x0xyl0">&gt;</span> <span data-svelte-h="svelte-1dajgef"><span class="underline ml-1.5">Update</span> on GitHub</span></a> <p></p>
<script>
{
// Client-side bootstrap for this pre-rendered page: publishes the site's path
// configuration on a global, then loads the two entry modules and starts the app.

// Assigned without a declaration, so this becomes a property of the global object.
// NOTE(review): presumably the imported entry modules read this global by this
// exact name — confirm against the build output before renaming it.
__sveltekit_jement = {
assets: "/docs/transformers/main/ja",
base: "/docs/transformers/main/ja",
env: {}
};
// Captured here, synchronously, because document.currentScript is only set while
// this <script> is executing; it would be null inside the promise callback below.
const element = document.currentScript.parentElement;
// NOTE(review): two null entries, same length as node_ids below — looks like one
// data slot per node; confirm against the framework's start() contract.
const data = [null,null];
// Load both entry modules in parallel; nothing runs until both have resolved.
Promise.all([
import("/docs/transformers/main/ja/_app/immutable/entry/start.1486e459.js"),
import("/docs/transformers/main/ja/_app/immutable/entry/app.d9ae818f.js")
]).then(([kit, app]) => {
// Hand the app module, the script's parent element, and the initial state to
// the start module's start() entry point.
kit.start(app, element, {
node_ids: [0, 138],
data,
form: null,
error: null
});
});
}
</script>

Xet Storage Details

Size:
70.6 kB
·
Xet hash:
ae7ba07d792ac2ed3d4e44480e868a9e998e576adf2abe401b8f3cc5b242a026

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.