Buckets:

rtrm's picture
download
raw
69.6 kB
<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;IDEFICS를 이용한 이미지 작업&quot;,&quot;local&quot;:&quot;image-tasks-with-idefics&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;모델 로드&quot;,&quot;local&quot;:&quot;loading-the-model&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;양자화된 모델&quot;,&quot;local&quot;:&quot;quantized-model&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;이미지 캡셔닝&quot;,&quot;local&quot;:&quot;image-captioning&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;프롬프트 이미지 캡셔닝&quot;,&quot;local&quot;:&quot;prompted-image-captioning&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;퓨샷 프롬프트&quot;,&quot;local&quot;:&quot;few-shot-prompting&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;시각적 질의 응답&quot;,&quot;local&quot;:&quot;visual-question-answering&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;이미지 분류&quot;,&quot;local&quot;:&quot;image-classification&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;이미지 기반 텍스트 생성&quot;,&quot;local&quot;:&quot;image-guided-text-generation&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;배치 모드에서 추론 실행&quot;,&quot;local&quot;:&quot;running-inference-in-batch-mode&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;대화형 사용을 위한 IDEFICS 인스트럭트 실행&quot;,&quot;local&quot;:&quot;idefics-instruct-for-conversational-use&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}">
<link href="/docs/transformers/main/ko/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload">
<link rel="modulepreload" href="/docs/transformers/main/ko/_app/immutable/entry/start.9aa88961.js">
<link rel="modulepreload" href="/docs/transformers/main/ko/_app/immutable/chunks/scheduler.9bc65507.js">
<link rel="modulepreload" href="/docs/transformers/main/ko/_app/immutable/chunks/singletons.9eec45c3.js">
<link rel="modulepreload" href="/docs/transformers/main/ko/_app/immutable/chunks/index.3b203c72.js">
<link rel="modulepreload" href="/docs/transformers/main/ko/_app/immutable/chunks/paths.566078f7.js">
<link rel="modulepreload" href="/docs/transformers/main/ko/_app/immutable/entry/app.84fb67c3.js">
<link rel="modulepreload" href="/docs/transformers/main/ko/_app/immutable/chunks/index.707bf1b6.js">
<link rel="modulepreload" href="/docs/transformers/main/ko/_app/immutable/nodes/0.1c99376b.js">
<link rel="modulepreload" href="/docs/transformers/main/ko/_app/immutable/chunks/each.e59479a4.js">
<link rel="modulepreload" href="/docs/transformers/main/ko/_app/immutable/nodes/64.49c20405.js">
<link rel="modulepreload" href="/docs/transformers/main/ko/_app/immutable/chunks/Tip.c2ecdbf4.js">
<link rel="modulepreload" href="/docs/transformers/main/ko/_app/immutable/chunks/CodeBlock.54a9f38d.js">
<link rel="modulepreload" href="/docs/transformers/main/ko/_app/immutable/chunks/DocNotebookDropdown.41f65cb5.js">
<link rel="modulepreload" href="/docs/transformers/main/ko/_app/immutable/chunks/globals.7f7f1b26.js">
<link rel="modulepreload" href="/docs/transformers/main/ko/_app/immutable/chunks/EditOnGithub.922df6ba.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;IDEFICS를 이용한 이미지 작업&quot;,&quot;local&quot;:&quot;image-tasks-with-idefics&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;모델 로드&quot;,&quot;local&quot;:&quot;loading-the-model&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;양자화된 모델&quot;,&quot;local&quot;:&quot;quantized-model&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;이미지 캡셔닝&quot;,&quot;local&quot;:&quot;image-captioning&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;프롬프트 이미지 캡셔닝&quot;,&quot;local&quot;:&quot;prompted-image-captioning&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;퓨샷 프롬프트&quot;,&quot;local&quot;:&quot;few-shot-prompting&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;시각적 질의 응답&quot;,&quot;local&quot;:&quot;visual-question-answering&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;이미지 분류&quot;,&quot;local&quot;:&quot;image-classification&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;이미지 기반 텍스트 생성&quot;,&quot;local&quot;:&quot;image-guided-text-generation&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;배치 모드에서 추론 실행&quot;,&quot;local&quot;:&quot;running-inference-in-batch-mode&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;대화형 사용을 위한 IDEFICS 인스트럭트 실행&quot;,&quot;local&quot;:&quot;idefics-instruct-for-conversational-use&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <h1 class="relative group"><a id="image-tasks-with-idefics" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#image-tasks-with-idefics"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>IDEFICS를 이용한 이미지 작업</span></h1> <div class="flex space-x-1 absolute z-10 right-0 top-0"> <div class="relative colab-dropdown "> <button class=" " type="button"> <img alt="Open In Colab" class="!m-0" src="https://colab.research.google.com/assets/colab-badge.svg"> </button> </div> <div class="relative colab-dropdown "> <button class=" " type="button"> <img alt="Open In Studio Lab" class="!m-0" src="https://studiolab.sagemaker.aws/studiolab.svg"> </button> </div></div> <p data-svelte-h="svelte-192zl04">개별 작업은 특화된 모델을 미세 조정하여 처리할 수 있지만, 최근 등장하여 인기를 얻고 있는 방식은 대규모 모델을 미세 조정 없이 다양한 작업에 사용하는 것입니다. 예를 들어, 대규모 언어 모델은 요약, 번역, 분류 등과 같은 자연어처리 (NLP) 작업을 처리할 수 있습니다. 이 접근 방식은 텍스트와 같은 단일 모달리티에 국한되지 않으며, 이 가이드에서는 IDEFICS라는 대규모 멀티모달 모델을 사용하여 이미지-텍스트 작업을 다루는 방법을 설명합니다.</p> <p data-svelte-h="svelte-6ga4od"><a href="../model_doc/idefics">IDEFICS</a><a href="https://huggingface.co/papers/2204.14198" rel="nofollow">Flamingo</a>를 기반으로 하는 오픈 액세스 비전 및 언어 모델로, DeepMind에서 처음 개발한 최신 시각 언어 모델입니다. 이 모델은 임의의 이미지 및 텍스트 입력 시퀀스를 받아 일관성 있는 텍스트를 출력으로 생성합니다. 이미지에 대한 질문에 답변하고, 시각적인 내용을 설명하며, 여러 이미지에 기반한 이야기를 생성하는 등 다양한 작업을 수행할 수 있습니다. IDEFICS는 <a href="https://huggingface.co/HuggingFaceM4/idefics-80b" rel="nofollow">800억 파라미터</a><a href="https://huggingface.co/HuggingFaceM4/idefics-9b" rel="nofollow">90억 파라미터</a> 두 가지 버전을 제공하며, 두 버전 모두 🤗 Hub에서 이용할 수 있습니다. 각 버전에는 대화형 사용 사례에 맞게 미세 조정된 버전도 있습니다.</p> <p data-svelte-h="svelte-1br8orr">이 모델은 매우 다재다능하며 광범위한 이미지 및 멀티모달 작업에 사용될 수 있습니다. 그러나 대규모 모델이기 때문에 상당한 컴퓨팅 자원과 인프라가 필요합니다. 각 개별 작업에 특화된 모델을 미세 조정하는 것보다 모델을 그대로 사용하는 것이 더 적합한지는 사용자가 판단해야 합니다.</p> <p data-svelte-h="svelte-1lptr1j">이 가이드에서는 다음을 배우게 됩니다:</p> <ul data-svelte-h="svelte-6sx3yz"><li><a href="#loading-the-model">IDEFICS 로드하기</a><a href="#quantized-model">양자화된 버전의 모델 로드하기</a></li> <li>IDEFICS를 사용하여:<ul><li><a href="#image-captioning">이미지 캡셔닝</a></li> <li><a href="#prompted-image-captioning">프롬프트 이미지 캡셔닝</a></li> <li><a href="#few-shot-prompting">퓨샷 프롬프트</a></li> <li><a href="#visual-question-answering">시각적 질의 응답</a></li> <li><a href="#image-classification">이미지 분류</a></li> <li><a href="#image-guided-text-generation">이미지 기반 텍스트 생성</a></li></ul></li> <li><a href="#running-inference-in-batch-mode">배치 모드에서 추론 실행</a></li> <li><a href="#idefics-instruct-for-conversational-use">대화형 사용을 위한 IDEFICS 인스트럭트 실행</a></li></ul> <p data-svelte-h="svelte-1nxgok6">시작하기 전에 필요한 모든 라이브러리가 설치되어 있는지 확인하세요.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->pip install -q bitsandbytes sentencepiece accelerate transformers<!-- HTML_TAG_END --></pre></div> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400">다음 예제를 비양자화된 버전의 모델 체크포인트로 실행하려면 최소 20GB의 GPU 메모리가 필요합니다.</div> <h2 class="relative group"><a id="loading-the-model" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#loading-the-model"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>모델 로드</span></h2> <p data-svelte-h="svelte-vbxrcp">모델을 90억 파라미터 버전의 체크포인트로 로드해 봅시다:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span>checkpoint = <span class="hljs-string">&quot;HuggingFaceM4/idefics-9b&quot;</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1f7xidq">다른 Transformers 모델과 마찬가지로, 체크포인트에서 프로세서와 모델 자체를 로드해야 합니다.
IDEFICS 프로세서는 <a href="/docs/transformers/main/ko/model_doc/llama#transformers.LlamaTokenizer">LlamaTokenizer</a>와 IDEFICS 이미지 프로세서를 하나의 프로세서로 감싸서 텍스트와 이미지 입력을 모델에 맞게 준비합니다.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">import</span> torch
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> IdeficsForVisionText2Text, AutoProcessor
<span class="hljs-meta">&gt;&gt;&gt; </span>processor = AutoProcessor.from_pretrained(checkpoint)
<span class="hljs-meta">&gt;&gt;&gt; </span>model = IdeficsForVisionText2Text.from_pretrained(checkpoint, torch_dtype=torch.bfloat16, device_map=<span class="hljs-string">&quot;auto&quot;</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-kvjr8l"><code>device_map</code><code>&quot;auto&quot;</code>로 설정하면 사용 중인 장치를 고려하여 모델 가중치를 가장 최적화된 방식으로 로드하고 저장하는 방법을 자동으로 결정합니다.</p> <h3 class="relative group"><a id="quantized-model" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#quantized-model"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>양자화된 모델</span></h3> <p data-svelte-h="svelte-1su4p88">고용량 GPU 사용이 어려운 경우, 모델의 양자화된 버전을 로드할 수 있습니다. 모델과 프로세서를 4비트 정밀도로 로드하기 위해서, <code>from_pretrained</code> 메소드에 <code>BitsAndBytesConfig</code>를 전달하면 모델이 로드되는 동안 실시간으로 압축됩니다.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">import</span> torch
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> IdeficsForVisionText2Text, AutoProcessor, BitsAndBytesConfig
<span class="hljs-meta">&gt;&gt;&gt; </span>quantization_config = BitsAndBytesConfig(
<span class="hljs-meta">... </span> load_in_4bit=<span class="hljs-literal">True</span>,
<span class="hljs-meta">... </span> bnb_4bit_compute_dtype=torch.float16,
<span class="hljs-meta">... </span>)
<span class="hljs-meta">&gt;&gt;&gt; </span>processor = AutoProcessor.from_pretrained(checkpoint)
<span class="hljs-meta">&gt;&gt;&gt; </span>model = IdeficsForVisionText2Text.from_pretrained(
<span class="hljs-meta">... </span> checkpoint,
<span class="hljs-meta">... </span> quantization_config=quantization_config,
<span class="hljs-meta">... </span> device_map=<span class="hljs-string">&quot;auto&quot;</span>
<span class="hljs-meta">... </span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1ve7k0">이제 모델을 제안된 방법 중 하나로 로드했으니, IDEFICS를 사용할 수 있는 작업들을 탐구해봅시다.</p> <h2 class="relative group"><a id="image-captioning" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#image-captioning"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>이미지 캡셔닝</span></h2> <p data-svelte-h="svelte-1qqb456">이미지 캡셔닝은 주어진 이미지에 대한 캡션을 예측하는 작업입니다. 일반적인 응용 분야는 시각 장애인이 다양한 상황을 탐색할 수 있도록 돕는 것입니다. 예를 들어, 온라인에서 이미지 콘텐츠를 탐색하는 데 도움을 줄 수 있습니다.</p> <p data-svelte-h="svelte-5ubear">작업을 설명하기 위해 캡션을 달 이미지 예시를 가져옵니다. 예시:</p> <div class="flex justify-center" data-svelte-h="svelte-t8y7db"><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/idefics-im-captioning.jpg" alt="Image of a puppy in a flower bed"></div> <p data-svelte-h="svelte-cqbxmp">사진 제공: <a href="https://unsplash.com/@hendoo" rel="nofollow">Hendo Wang</a>.</p> <p data-svelte-h="svelte-cqnpwc">IDEFICS는 텍스트 및 이미지 프롬프트를 모두 수용합니다. 그러나 이미지를 캡션하기 위해 모델에 텍스트 프롬프트를 제공할 필요는 없습니다. 전처리된 입력 이미지만 제공하면 됩니다. 텍스트 프롬프트 없이 모델은 BOS(시퀀스 시작) 토큰부터 텍스트 생성을 시작하여 캡션을 만듭니다.</p> <p data-svelte-h="svelte-1xlv5ti">모델에 이미지 입력으로는 이미지 객체(<code>PIL.Image</code>) 또는 이미지를 가져올 수 있는 URL을 사용할 수 있습니다.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span>prompt = [
<span class="hljs-meta">... </span> <span class="hljs-string">&quot;https://images.unsplash.com/photo-1583160247711-2191776b4b91?ixlib=rb-4.0.3&amp;ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&amp;auto=format&amp;fit=crop&amp;w=3542&amp;q=80&quot;</span>,
<span class="hljs-meta">... </span>]
<span class="hljs-meta">&gt;&gt;&gt; </span>inputs = processor(prompt, return_tensors=<span class="hljs-string">&quot;pt&quot;</span>).to(<span class="hljs-string">&quot;cuda&quot;</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span>bad_words_ids = processor.tokenizer([<span class="hljs-string">&quot;&lt;image&gt;&quot;</span>, <span class="hljs-string">&quot;&lt;fake_token_around_image&gt;&quot;</span>], add_special_tokens=<span class="hljs-literal">False</span>).input_ids
<span class="hljs-meta">&gt;&gt;&gt; </span>generated_ids = model.generate(**inputs, max_new_tokens=<span class="hljs-number">10</span>, bad_words_ids=bad_words_ids)
<span class="hljs-meta">&gt;&gt;&gt; </span>generated_text = processor.batch_decode(generated_ids, skip_special_tokens=<span class="hljs-literal">True</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-built_in">print</span>(generated_text[<span class="hljs-number">0</span>])
A puppy <span class="hljs-keyword">in</span> a flower bed<!-- HTML_TAG_END --></pre></div> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-197yhnw"><code>max_new_tokens</code>의 크기를 증가시킬 때 발생할 수 있는 오류를 피하기 위해 <code>generate</code> 호출 시 <code>bad_words_ids</code>를 포함하는 것이 좋습니다. 모델로부터 생성된 이미지가 없을 때 새로운 <code>&lt;image&gt;</code> 또는 <code>&lt;fake_token_around_image&gt;</code> 토큰을 생성하려고 하기 때문입니다.
이 가이드에서처럼 <code>bad_words_ids</code>를 함수 호출 시에 매개변수로 설정하거나, <a href="../generation_strategies">텍스트 생성 전략</a> 가이드에 설명된 대로 <code>GenerationConfig</code>에 저장할 수도 있습니다.</p></div> <h2 class="relative group"><a id="prompted-image-captioning" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#prompted-image-captioning"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>프롬프트 이미지 캡셔닝</span></h2> <p data-svelte-h="svelte-yxenqx">텍스트 프롬프트를 이용하여 이미지 캡셔닝을 확장할 수 있으며, 모델은 주어진 이미지를 바탕으로 텍스트를 계속 생성합니다. 다음 이미지를 예시로 들어보겠습니다:</p> <div class="flex justify-center" data-svelte-h="svelte-1ritb1k"><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/idefics-prompted-im-captioning.jpg" alt="Image of the Eiffel Tower at night"></div> <p data-svelte-h="svelte-2sdmkq">사진 제공: <a href="https://unsplash.com/@dnevozhai" rel="nofollow">Denys Nevozhai</a>.</p> <p data-svelte-h="svelte-1qyvy5u">텍스트 및 이미지 프롬프트는 적절한 입력을 생성하기 위해 모델의 프로세서에 하나의 목록으로 전달될 수 있습니다.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span>prompt = [
<span class="hljs-meta">... </span> <span class="hljs-string">&quot;https://images.unsplash.com/photo-1543349689-9a4d426bee8e?ixlib=rb-4.0.3&amp;ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&amp;auto=format&amp;fit=crop&amp;w=3501&amp;q=80&quot;</span>,
<span class="hljs-meta">... </span> <span class="hljs-string">&quot;This is an image of &quot;</span>,
<span class="hljs-meta">... </span>]
<span class="hljs-meta">&gt;&gt;&gt; </span>inputs = processor(prompt, return_tensors=<span class="hljs-string">&quot;pt&quot;</span>).to(<span class="hljs-string">&quot;cuda&quot;</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span>bad_words_ids = processor.tokenizer([<span class="hljs-string">&quot;&lt;image&gt;&quot;</span>, <span class="hljs-string">&quot;&lt;fake_token_around_image&gt;&quot;</span>], add_special_tokens=<span class="hljs-literal">False</span>).input_ids
<span class="hljs-meta">&gt;&gt;&gt; </span>generated_ids = model.generate(**inputs, max_new_tokens=<span class="hljs-number">10</span>, bad_words_ids=bad_words_ids)
<span class="hljs-meta">&gt;&gt;&gt; </span>generated_text = processor.batch_decode(generated_ids, skip_special_tokens=<span class="hljs-literal">True</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-built_in">print</span>(generated_text[<span class="hljs-number">0</span>])
This <span class="hljs-keyword">is</span> an image of the Eiffel Tower <span class="hljs-keyword">in</span> Paris, France.<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="few-shot-prompting" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#few-shot-prompting"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>퓨샷 프롬프트</span></h2> <p data-svelte-h="svelte-t289km">IDEFICS는 훌륭한 제로샷 결과를 보여주지만, 작업에 특정 형식의 캡션이 필요하거나 작업의 복잡성을 높이는 다른 제한 사항이나 요구 사항이 있을 수 있습니다. 이럴 때 퓨샷 프롬프트를 사용하여 맥락 내 학습(In-Context Learning)을 가능하게 할 수 있습니다.
프롬프트에 예시를 제공함으로써 모델이 주어진 예시의 형식을 모방한 결과를 생성하도록 유도할 수 있습니다.</p> <p data-svelte-h="svelte-3v80oi">이전의 에펠탑 이미지를 모델에 예시로 사용하고, 모델에게 이미지의 객체를 학습하는 것 외에도 흥미로운 정보를 얻고 싶다는 것을 보여주는 프롬프트를 작성해 봅시다.
그런 다음 자유의 여신상 이미지에 대해 동일한 응답 형식을 얻을 수 있는지 확인해 봅시다:</p> <div class="flex justify-center" data-svelte-h="svelte-gin1vp"><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/idefics-few-shot.jpg" alt="Image of the Statue of Liberty"></div> <p data-svelte-h="svelte-1c7akyk">사진 제공: <a href="https://unsplash.com/@jmayobres" rel="nofollow">Juan Mayobre</a>.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span>prompt = [<span class="hljs-string">&quot;User:&quot;</span>,
<span class="hljs-meta">... </span> <span class="hljs-string">&quot;https://images.unsplash.com/photo-1543349689-9a4d426bee8e?ixlib=rb-4.0.3&amp;ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&amp;auto=format&amp;fit=crop&amp;w=3501&amp;q=80&quot;</span>,
<span class="hljs-meta">... </span> <span class="hljs-string">&quot;Describe this image.\nAssistant: An image of the Eiffel Tower at night. Fun fact: the Eiffel Tower is the same height as an 81-storey building.\n&quot;</span>,
<span class="hljs-meta">... </span> <span class="hljs-string">&quot;User:&quot;</span>,
<span class="hljs-meta">... </span> <span class="hljs-string">&quot;https://images.unsplash.com/photo-1524099163253-32b7f0256868?ixlib=rb-4.0.3&amp;ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&amp;auto=format&amp;fit=crop&amp;w=3387&amp;q=80&quot;</span>,
<span class="hljs-meta">... </span> <span class="hljs-string">&quot;Describe this image.\nAssistant:&quot;</span>
<span class="hljs-meta">... </span> ]
<span class="hljs-meta">&gt;&gt;&gt; </span>inputs = processor(prompt, return_tensors=<span class="hljs-string">&quot;pt&quot;</span>).to(<span class="hljs-string">&quot;cuda&quot;</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span>bad_words_ids = processor.tokenizer([<span class="hljs-string">&quot;&lt;image&gt;&quot;</span>, <span class="hljs-string">&quot;&lt;fake_token_around_image&gt;&quot;</span>], add_special_tokens=<span class="hljs-literal">False</span>).input_ids
<span class="hljs-meta">&gt;&gt;&gt; </span>generated_ids = model.generate(**inputs, max_new_tokens=<span class="hljs-number">30</span>, bad_words_ids=bad_words_ids)
<span class="hljs-meta">&gt;&gt;&gt; </span>generated_text = processor.batch_decode(generated_ids, skip_special_tokens=<span class="hljs-literal">True</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-built_in">print</span>(generated_text[<span class="hljs-number">0</span>])
User: Describe this image.
Assistant: An image of the Eiffel Tower at night. Fun fact: the Eiffel Tower <span class="hljs-keyword">is</span> the same height <span class="hljs-keyword">as</span> an <span class="hljs-number">81</span>-storey building.
User: Describe this image.
Assistant: An image of the Statue of Liberty. Fun fact: the Statue of Liberty <span class="hljs-keyword">is</span> <span class="hljs-number">151</span> feet tall.<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-4qx2vi">단 하나의 예시만으로도(즉, 1-shot) 모델이 작업 수행 방법을 학습했다는 점이 주목할 만합니다. 더 복잡한 작업의 경우, 더 많은 예시(예: 3-shot, 5-shot 등)를 사용하여 실험해 보는 것도 좋은 방법입니다.</p> <h2 class="relative group"><a id="visual-question-answering" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#visual-question-answering"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>시각적 질의 응답</span></h2> <p data-svelte-h="svelte-1c8lsvj">시각적 질의 응답(VQA)은 이미지를 기반으로 개방형 질문에 답하는 작업입니다. 이미지 캡셔닝과 마찬가지로 접근성 애플리케이션에서 사용할 수 있지만, 교육(시각 자료에 대한 추론), 고객 서비스(이미지를 기반으로 한 제품 질문), 이미지 검색 등에서도 사용할 수 있습니다.</p> <p data-svelte-h="svelte-i2mdmn">이 작업을 위해 새로운 이미지를 가져옵니다:</p> <div class="flex justify-center" data-svelte-h="svelte-1j2xr8e"><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/idefics-vqa.jpg" alt="Image of a couple having a picnic"></div> <p data-svelte-h="svelte-1d2pqeh">사진 제공: <a href="https://unsplash.com/@jarritos" rel="nofollow">Jarritos Mexican Soda</a>.</p> <p data-svelte-h="svelte-1hqholj">적절한 지시문을 사용하면 이미지 캡셔닝에서 시각적 질의 응답으로 모델을 유도할 수 있습니다:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span>prompt = [
<span class="hljs-meta">... </span> <span class="hljs-string">&quot;Instruction: Provide an answer to the question. Use the image to answer.\n&quot;</span>,
<span class="hljs-meta">... </span> <span class="hljs-string">&quot;https://images.unsplash.com/photo-1623944889288-cd147dbb517c?ixlib=rb-4.0.3&amp;ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&amp;auto=format&amp;fit=crop&amp;w=3540&amp;q=80&quot;</span>,
<span class="hljs-meta">... </span> <span class="hljs-string">&quot;Question: Where are these people and what&#x27;s the weather like? Answer:&quot;</span>
<span class="hljs-meta">... </span>]
<span class="hljs-meta">&gt;&gt;&gt; </span>inputs = processor(prompt, return_tensors=<span class="hljs-string">&quot;pt&quot;</span>).to(<span class="hljs-string">&quot;cuda&quot;</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span>bad_words_ids = processor.tokenizer([<span class="hljs-string">&quot;&lt;image&gt;&quot;</span>, <span class="hljs-string">&quot;&lt;fake_token_around_image&gt;&quot;</span>], add_special_tokens=<span class="hljs-literal">False</span>).input_ids
<span class="hljs-meta">&gt;&gt;&gt; </span>generated_ids = model.generate(**inputs, max_new_tokens=<span class="hljs-number">20</span>, bad_words_ids=bad_words_ids)
<span class="hljs-meta">&gt;&gt;&gt; </span>generated_text = processor.batch_decode(generated_ids, skip_special_tokens=<span class="hljs-literal">True</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-built_in">print</span>(generated_text[<span class="hljs-number">0</span>])
Instruction: Provide an answer to the question. Use the image to answer.
Question: Where are these people <span class="hljs-keyword">and</span> what<span class="hljs-string">&#x27;s the weather like? Answer: They&#x27;</span>re <span class="hljs-keyword">in</span> a park <span class="hljs-keyword">in</span> New York City, <span class="hljs-keyword">and</span> it<span class="hljs-string">&#x27;s a beautiful day.</span><!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="image-classification" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#image-classification"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>이미지 분류</span></h2> <p data-svelte-h="svelte-1367av1">IDEFICS는 특정 카테고리의 라벨이 포함된 데이터로 명시적으로 학습되지 않아도 이미지를 다양한 카테고리로 분류할 수 있습니다. 카테고리 목록이 주어지면, 모델은 이미지와 텍스트 이해 능력을 사용하여 이미지가 속할 가능성이 높은 카테고리를 추론할 수 있습니다.</p> <p data-svelte-h="svelte-18ksnpv">여기에 야채 가판대 이미지가 있습니다.</p> <div class="flex justify-center" data-svelte-h="svelte-g02ga3"><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/idefics-classification.jpg" alt="Image of a vegetable stand"></div> <p data-svelte-h="svelte-133qla9">사진 제공: <a href="https://unsplash.com/@peterwendt" rel="nofollow">Peter Wendt</a>.</p> <p data-svelte-h="svelte-cagop">우리는 모델에게 우리가 가진 카테고리 중 하나로 이미지를 분류하도록 지시할 수 있습니다:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span>categories = [<span class="hljs-string">&#x27;animals&#x27;</span>,<span class="hljs-string">&#x27;vegetables&#x27;</span>, <span class="hljs-string">&#x27;city landscape&#x27;</span>, <span class="hljs-string">&#x27;cars&#x27;</span>, <span class="hljs-string">&#x27;office&#x27;</span>]
<span class="hljs-meta">&gt;&gt;&gt; </span>prompt = [<span class="hljs-string">f&quot;Instruction: Classify the following image into a single category from the following list: <span class="hljs-subst">{categories}</span>.\n&quot;</span>,
<span class="hljs-meta">... </span> <span class="hljs-string">&quot;https://images.unsplash.com/photo-1471193945509-9ad0617afabf?ixlib=rb-4.0.3&amp;ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&amp;auto=format&amp;fit=crop&amp;w=3540&amp;q=80&quot;</span>,
<span class="hljs-meta">... </span> <span class="hljs-string">&quot;Category: &quot;</span>
<span class="hljs-meta">... </span>]
<span class="hljs-meta">&gt;&gt;&gt; </span>inputs = processor(prompt, return_tensors=<span class="hljs-string">&quot;pt&quot;</span>).to(<span class="hljs-string">&quot;cuda&quot;</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span>bad_words_ids = processor.tokenizer([<span class="hljs-string">&quot;&lt;image&gt;&quot;</span>, <span class="hljs-string">&quot;&lt;fake_token_around_image&gt;&quot;</span>], add_special_tokens=<span class="hljs-literal">False</span>).input_ids
<span class="hljs-meta">&gt;&gt;&gt; </span>generated_ids = model.generate(**inputs, max_new_tokens=<span class="hljs-number">6</span>, bad_words_ids=bad_words_ids)
<span class="hljs-meta">&gt;&gt;&gt; </span>generated_text = processor.batch_decode(generated_ids, skip_special_tokens=<span class="hljs-literal">True</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-built_in">print</span>(generated_text[<span class="hljs-number">0</span>])
Instruction: Classify the following image into a single category <span class="hljs-keyword">from</span> the following <span class="hljs-built_in">list</span>: [<span class="hljs-string">&#x27;animals&#x27;</span>, <span class="hljs-string">&#x27;vegetables&#x27;</span>, <span class="hljs-string">&#x27;city landscape&#x27;</span>, <span class="hljs-string">&#x27;cars&#x27;</span>, <span class="hljs-string">&#x27;office&#x27;</span>].
Category: Vegetables<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-pzk7p8">위 예제에서는 모델에게 이미지를 단일 카테고리로 분류하도록 지시했지만, 순위 분류를 하도록 모델에 프롬프트를 제공할 수도 있습니다.</p> <h2 class="relative group"><a id="image-guided-text-generation" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#image-guided-text-generation"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>이미지 기반 텍스트 생성</span></h2> <p data-svelte-h="svelte-1snw1di">이미지를 활용한 텍스트 생성 기술을 사용하면 더욱 창의적인 작업이 가능합니다. 이 기술은 이미지를 바탕으로 텍스트를 만들어내며, 제품 설명, 광고 문구, 장면 묘사 등 다양한 용도로 활용할 수 있습니다.</p> <p data-svelte-h="svelte-1n1vxz8">간단한 예로, 빨간 문 이미지를 IDEFICS에 입력하여 이야기를 만들어보겠습니다:</p> <div class="flex justify-center" data-svelte-h="svelte-1mf93u3"><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/idefics-story-generation.jpg" alt="Image of a red door with a pumpkin on the steps"></div> <p data-svelte-h="svelte-s6ch7k">사진 제공: <a href="https://unsplash.com/@devonshiremedia" rel="nofollow">Craig Tidball</a>.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span>prompt = [<span class="hljs-string">&quot;Instruction: Use the image to write a story. \n&quot;</span>,
<span class="hljs-meta">... </span> <span class="hljs-string">&quot;https://images.unsplash.com/photo-1517086822157-2b0358e7684a?ixlib=rb-4.0.3&amp;ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&amp;auto=format&amp;fit=crop&amp;w=2203&amp;q=80&quot;</span>,
<span class="hljs-meta">... </span> <span class="hljs-string">&quot;Story: \n&quot;</span>]
<span class="hljs-meta">&gt;&gt;&gt; </span>inputs = processor(prompt, return_tensors=<span class="hljs-string">&quot;pt&quot;</span>).to(<span class="hljs-string">&quot;cuda&quot;</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span>bad_words_ids = processor.tokenizer([<span class="hljs-string">&quot;&lt;image&gt;&quot;</span>, <span class="hljs-string">&quot;&lt;fake_token_around_image&gt;&quot;</span>], add_special_tokens=<span class="hljs-literal">False</span>).input_ids
<span class="hljs-meta">&gt;&gt;&gt; </span>generated_ids = model.generate(**inputs, num_beams=<span class="hljs-number">2</span>, max_new_tokens=<span class="hljs-number">200</span>, bad_words_ids=bad_words_ids)
<span class="hljs-meta">&gt;&gt;&gt; </span>generated_text = processor.batch_decode(generated_ids, skip_special_tokens=<span class="hljs-literal">True</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-built_in">print</span>(generated_text[<span class="hljs-number">0</span>])
Instruction: Use the image to write a story.
Story:
Once upon a time, there was a little girl who lived <span class="hljs-keyword">in</span> a house <span class="hljs-keyword">with</span> a red door. She loved her red door. It was the prettiest door <span class="hljs-keyword">in</span> the whole world.
One day, the little girl was playing <span class="hljs-keyword">in</span> her yard when she noticed a man standing on her doorstep. He was wearing a long black coat <span class="hljs-keyword">and</span> a top hat.
The little girl ran inside <span class="hljs-keyword">and</span> told her mother about the man.
Her mother said, “Don’t worry, honey. He’s just a friendly ghost.”
The little girl wasn’t sure <span class="hljs-keyword">if</span> she believed her mother, but she went outside anyway.
When she got to the door, the man was gone.
The <span class="hljs-built_in">next</span> day, the little girl was playing <span class="hljs-keyword">in</span> her yard again when she noticed the man standing on her doorstep.
He was wearing a long black coat <span class="hljs-keyword">and</span> a top hat.
The little girl ran<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1nhpk1e">IDEFICS가 문 앞에 있는 호박을 보고 유령에 대한 으스스한 할로윈 이야기를 만든 것 같습니다.</p> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-tpw64k">이처럼 긴 텍스트를 생성할 때는 텍스트 생성 전략을 조정하는 것이 좋습니다. 이렇게 하면 생성된 결과물의 품질을 크게 향상시킬 수 있습니다. 자세한 내용은 <a href="../generation_strategies">텍스트 생성 전략</a>을 참조하세요.</p></div> <h2 class="relative group"><a id="running-inference-in-batch-mode" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#running-inference-in-batch-mode"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>배치 모드에서 추론 실행</span></h2> <p data-svelte-h="svelte-jth63i">앞선 모든 섹션에서는 단일 예시에 대해 IDEFICS를 설명했습니다. 이와 매우 유사한 방식으로, 프롬프트 목록을 전달하여 여러 예시에 대한 추론을 실행할 수 있습니다:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span>prompts = [
<span class="hljs-meta">... </span> [ <span class="hljs-string">&quot;https://images.unsplash.com/photo-1543349689-9a4d426bee8e?ixlib=rb-4.0.3&amp;ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&amp;auto=format&amp;fit=crop&amp;w=3501&amp;q=80&quot;</span>,
<span class="hljs-meta">... </span> <span class="hljs-string">&quot;This is an image of &quot;</span>,
<span class="hljs-meta">... </span> ],
<span class="hljs-meta">... </span> [ <span class="hljs-string">&quot;https://images.unsplash.com/photo-1623944889288-cd147dbb517c?ixlib=rb-4.0.3&amp;ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&amp;auto=format&amp;fit=crop&amp;w=3540&amp;q=80&quot;</span>,
<span class="hljs-meta">... </span> <span class="hljs-string">&quot;This is an image of &quot;</span>,
<span class="hljs-meta">... </span> ],
<span class="hljs-meta">... </span> [ <span class="hljs-string">&quot;https://images.unsplash.com/photo-1471193945509-9ad0617afabf?ixlib=rb-4.0.3&amp;ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&amp;auto=format&amp;fit=crop&amp;w=3540&amp;q=80&quot;</span>,
<span class="hljs-meta">... </span> <span class="hljs-string">&quot;This is an image of &quot;</span>,
<span class="hljs-meta">... </span> ],
<span class="hljs-meta">... </span>]
<span class="hljs-meta">&gt;&gt;&gt; </span>inputs = processor(prompts, return_tensors=<span class="hljs-string">&quot;pt&quot;</span>).to(<span class="hljs-string">&quot;cuda&quot;</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span>bad_words_ids = processor.tokenizer([<span class="hljs-string">&quot;&lt;image&gt;&quot;</span>, <span class="hljs-string">&quot;&lt;fake_token_around_image&gt;&quot;</span>], add_special_tokens=<span class="hljs-literal">False</span>).input_ids
<span class="hljs-meta">&gt;&gt;&gt; </span>generated_ids = model.generate(**inputs, max_new_tokens=<span class="hljs-number">10</span>, bad_words_ids=bad_words_ids)
<span class="hljs-meta">&gt;&gt;&gt; </span>generated_text = processor.batch_decode(generated_ids, skip_special_tokens=<span class="hljs-literal">True</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">for</span> i,t <span class="hljs-keyword">in</span> <span class="hljs-built_in">enumerate</span>(generated_text):
<span class="hljs-meta">... </span> <span class="hljs-built_in">print</span>(<span class="hljs-string">f&quot;<span class="hljs-subst">{i}</span>:\n<span class="hljs-subst">{t}</span>\n&quot;</span>)
<span class="hljs-number">0</span>:
This <span class="hljs-keyword">is</span> an image of the Eiffel Tower <span class="hljs-keyword">in</span> Paris, France.
<span class="hljs-number">1</span>:
This <span class="hljs-keyword">is</span> an image of a couple on a picnic blanket.
<span class="hljs-number">2</span>:
This <span class="hljs-keyword">is</span> an image of a vegetable stand.<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="idefics-instruct-for-conversational-use" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#idefics-instruct-for-conversational-use"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>대화형 사용을 위한 IDEFICS 인스트럭트 실행</span></h2> <p data-svelte-h="svelte-me51t0">대화형 사용 사례를 위해, 🤗 Hub에서 명령어 수행에 최적화된 버전의 모델을 찾을 수 있습니다. 이곳에는 <code>HuggingFaceM4/idefics-80b-instruct</code><code>HuggingFaceM4/idefics-9b-instruct</code>가 있습니다.</p> <p data-svelte-h="svelte-18svak1">이 체크포인트는 지도 학습 및 명령어 미세 조정 데이터셋의 혼합으로 각각의 기본 모델을 미세 조정한 결과입니다. 이를 통해 모델의 하위 작업 성능을 향상시키는 동시에 대화형 환경에서 모델을 더 사용하기 쉽게 합니다.</p> <p data-svelte-h="svelte-4akhs6">대화형 사용을 위한 사용법 및 프롬프트는 기본 모델을 사용하는 것과 매우 유사합니다.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">import</span> torch
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> IdeficsForVisionText2Text, AutoProcessor
<span class="hljs-meta">&gt;&gt;&gt; </span>device = <span class="hljs-string">&quot;cuda&quot;</span> <span class="hljs-keyword">if</span> torch.cuda.is_available() <span class="hljs-keyword">else</span> <span class="hljs-string">&quot;cpu&quot;</span>
<span class="hljs-meta">&gt;&gt;&gt; </span>checkpoint = <span class="hljs-string">&quot;HuggingFaceM4/idefics-9b-instruct&quot;</span>
<span class="hljs-meta">&gt;&gt;&gt; </span>model = IdeficsForVisionText2Text.from_pretrained(checkpoint, torch_dtype=torch.bfloat16).to(device)
<span class="hljs-meta">&gt;&gt;&gt; </span>processor = AutoProcessor.from_pretrained(checkpoint)
<span class="hljs-meta">&gt;&gt;&gt; </span>prompts = [
<span class="hljs-meta">... </span> [
<span class="hljs-meta">... </span> <span class="hljs-string">&quot;User: What is in this image?&quot;</span>,
<span class="hljs-meta">... </span> <span class="hljs-string">&quot;https://upload.wikimedia.org/wikipedia/commons/8/86/Id%C3%A9fix.JPG&quot;</span>,
<span class="hljs-meta">... </span> <span class="hljs-string">&quot;&lt;end_of_utterance&gt;&quot;</span>,
<span class="hljs-meta">... </span> <span class="hljs-string">&quot;\nAssistant: This picture depicts Idefix, the dog of Obelix in Asterix and Obelix. Idefix is running on the ground.&lt;end_of_utterance&gt;&quot;</span>,
<span class="hljs-meta">... </span> <span class="hljs-string">&quot;\nUser:&quot;</span>,
<span class="hljs-meta">... </span> <span class="hljs-string">&quot;https://static.wikia.nocookie.net/asterix/images/2/25/R22b.gif/revision/latest?cb=20110815073052&quot;</span>,
<span class="hljs-meta">... </span> <span class="hljs-string">&quot;And who is that?&lt;end_of_utterance&gt;&quot;</span>,
<span class="hljs-meta">... </span> <span class="hljs-string">&quot;\nAssistant:&quot;</span>,
<span class="hljs-meta">... </span> ],
<span class="hljs-meta">... </span>]
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-comment"># --batched mode</span>
<span class="hljs-meta">&gt;&gt;&gt; </span>inputs = processor(prompts, add_end_of_utterance_token=<span class="hljs-literal">False</span>, return_tensors=<span class="hljs-string">&quot;pt&quot;</span>).to(device)
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-comment"># --single sample mode</span>
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-comment"># inputs = processor(prompts[0], return_tensors=&quot;pt&quot;).to(device)</span>
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-comment"># args 생성</span>
<span class="hljs-meta">&gt;&gt;&gt; </span>exit_condition = processor.tokenizer(<span class="hljs-string">&quot;&lt;end_of_utterance&gt;&quot;</span>, add_special_tokens=<span class="hljs-literal">False</span>).input_ids
<span class="hljs-meta">&gt;&gt;&gt; </span>bad_words_ids = processor.tokenizer([<span class="hljs-string">&quot;&lt;image&gt;&quot;</span>, <span class="hljs-string">&quot;&lt;fake_token_around_image&gt;&quot;</span>], add_special_tokens=<span class="hljs-literal">False</span>).input_ids
<span class="hljs-meta">&gt;&gt;&gt; </span>generated_ids = model.generate(**inputs, eos_token_id=exit_condition, bad_words_ids=bad_words_ids, max_length=<span class="hljs-number">100</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span>generated_text = processor.batch_decode(generated_ids, skip_special_tokens=<span class="hljs-literal">True</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">for</span> i, t <span class="hljs-keyword">in</span> <span class="hljs-built_in">enumerate</span>(generated_text):
<span class="hljs-meta">... </span> <span class="hljs-built_in">print</span>(<span class="hljs-string">f&quot;<span class="hljs-subst">{i}</span>:\n<span class="hljs-subst">{t}</span>\n&quot;</span>)<!-- HTML_TAG_END --></pre></div> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/transformers/blob/main/docs/source/ko/tasks/idefics.md" target="_blank"><span data-svelte-h="svelte-1kd6by1">&lt;</span> <span data-svelte-h="svelte-x0xyl0">&gt;</span> <span data-svelte-h="svelte-1dajgef"><span class="underline ml-1.5">Update</span> on GitHub</span></a> <p></p>
<script>
{
__sveltekit_1hrx8 = {
assets: "/docs/transformers/main/ko",
base: "/docs/transformers/main/ko",
env: {}
};
const element = document.currentScript.parentElement;
const data = [null,null];
Promise.all([
import("/docs/transformers/main/ko/_app/immutable/entry/start.9aa88961.js"),
import("/docs/transformers/main/ko/_app/immutable/entry/app.84fb67c3.js")
]).then(([kit, app]) => {
kit.start(app, element, {
node_ids: [0, 64],
data,
form: null,
error: null
});
});
}
</script>

Xet Storage Details

Size:
69.6 kB
·
Xet hash:
c621cca4523464c1aaeeb54b4664ed1bf237c0de85ca63e962cf52cff4f84b53

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.