Buckets:

hf-doc-build
/

doc-dev

Files

xet

hf-doc-build/doc-dev / transformers /pr_33913 /en /task_summary.html

rtrm

3 months ago

download

raw

83.6 kB

	<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{"title":"What 🤗 Transformers can do","local":"what--transformers-can-do","sections":[{"title":"Audio","local":"audio","sections":[{"title":"Audio classification","local":"audio-classification","sections":[],"depth":3},{"title":"Automatic speech recognition","local":"automatic-speech-recognition","sections":[],"depth":3}],"depth":2},{"title":"Computer vision","local":"computer-vision","sections":[{"title":"Image classification","local":"image-classification","sections":[],"depth":3},{"title":"Object detection","local":"object-detection","sections":[],"depth":3},{"title":"Image segmentation","local":"image-segmentation","sections":[],"depth":3},{"title":"Depth estimation","local":"depth-estimation","sections":[],"depth":3}],"depth":2},{"title":"Natural language processing","local":"natural-language-processing","sections":[{"title":"Text classification","local":"text-classification","sections":[],"depth":3},{"title":"Token classification","local":"token-classification","sections":[],"depth":3},{"title":"Question answering","local":"question-answering","sections":[],"depth":3},{"title":"Summarization","local":"summarization","sections":[],"depth":3},{"title":"Translation","local":"translation","sections":[],"depth":3},{"title":"Language modeling","local":"language-modeling","sections":[],"depth":3}],"depth":2},{"title":"Multimodal","local":"multimodal","sections":[{"title":"Document question answering","local":"document-question-answering","sections":[],"depth":3}],"depth":2}],"depth":1}">
	<link href="/docs/transformers/pr_33913/en/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload">
	<link rel="modulepreload" href="/docs/transformers/pr_33913/en/_app/immutable/entry/start.b67f883f.js">
	<link rel="modulepreload" href="/docs/transformers/pr_33913/en/_app/immutable/chunks/scheduler.25b97de1.js">
	<link rel="modulepreload" href="/docs/transformers/pr_33913/en/_app/immutable/chunks/singletons.62a184e0.js">
	<link rel="modulepreload" href="/docs/transformers/pr_33913/en/_app/immutable/chunks/index.e188933d.js">
	<link rel="modulepreload" href="/docs/transformers/pr_33913/en/_app/immutable/chunks/paths.51881b9e.js">
	<link rel="modulepreload" href="/docs/transformers/pr_33913/en/_app/immutable/entry/app.e436b1f2.js">
	<link rel="modulepreload" href="/docs/transformers/pr_33913/en/_app/immutable/chunks/index.d9030fc9.js">
	<link rel="modulepreload" href="/docs/transformers/pr_33913/en/_app/immutable/nodes/0.05e395f5.js">
	<link rel="modulepreload" href="/docs/transformers/pr_33913/en/_app/immutable/chunks/each.e59479a4.js">
	<link rel="modulepreload" href="/docs/transformers/pr_33913/en/_app/immutable/nodes/414.0eb3dad4.js">
	<link rel="modulepreload" href="/docs/transformers/pr_33913/en/_app/immutable/chunks/CodeBlock.e6cd0d95.js">
	<link rel="modulepreload" href="/docs/transformers/pr_33913/en/_app/immutable/chunks/EditOnGithub.91d95064.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{"title":"What 🤗 Transformers can do","local":"what--transformers-can-do","sections":[{"title":"Audio","local":"audio","sections":[{"title":"Audio classification","local":"audio-classification","sections":[],"depth":3},{"title":"Automatic speech recognition","local":"automatic-speech-recognition","sections":[],"depth":3}],"depth":2},{"title":"Computer vision","local":"computer-vision","sections":[{"title":"Image classification","local":"image-classification","sections":[],"depth":3},{"title":"Object detection","local":"object-detection","sections":[],"depth":3},{"title":"Image segmentation","local":"image-segmentation","sections":[],"depth":3},{"title":"Depth estimation","local":"depth-estimation","sections":[],"depth":3}],"depth":2},{"title":"Natural language processing","local":"natural-language-processing","sections":[{"title":"Text classification","local":"text-classification","sections":[],"depth":3},{"title":"Token classification","local":"token-classification","sections":[],"depth":3},{"title":"Question answering","local":"question-answering","sections":[],"depth":3},{"title":"Summarization","local":"summarization","sections":[],"depth":3},{"title":"Translation","local":"translation","sections":[],"depth":3},{"title":"Language modeling","local":"language-modeling","sections":[],"depth":3}],"depth":2},{"title":"Multimodal","local":"multimodal","sections":[{"title":"Document question answering","local":"document-question-answering","sections":[],"depth":3}],"depth":2}],"depth":1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <h1 class="relative group"><a id="what--transformers-can-do" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#what--transformers-can-do"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>What 🤗 Transformers can do</span></h1> <p data-svelte-h="svelte-16honzi">🤗 Transformers is a library of pretrained state-of-the-art models for natural language processing (NLP), computer vision, and audio and speech processing tasks. Not only does the library contain Transformer models, but it also has non-Transformer models like modern convolutional networks for computer vision tasks. If you look at some of the most popular consumer products today, like smartphones, apps, and televisions, odds are that some kind of deep learning technology is behind it. Want to remove a background object from a picture taken by your smartphone? This is an example of a panoptic segmentation task (don’t worry if you don’t know what this means yet, we’ll describe it in the following sections!).</p> <p data-svelte-h="svelte-co1bd8">This page provides an overview of the different speech and audio, computer vision, and NLP tasks that can be solved with the 🤗 Transformers library in just three lines of code!</p> <h2 class="relative group"><a id="audio" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#audio"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Audio</span></h2> <p data-svelte-h="svelte-y5bzkk">Audio and speech processing tasks are a little different from the other modalities mainly because audio as an input is a continuous signal. Unlike text, a raw audio waveform can’t be neatly split into discrete chunks the way a sentence can be divided into words. To get around this, the raw audio signal is typically sampled at regular intervals. If you take more samples within an interval, the sampling rate is higher, and the audio more closely resembles the original audio source.</p> <p data-svelte-h="svelte-1buqhp9">Previous approaches preprocessed the audio to extract useful features from it. It is now more common to start audio and speech processing tasks by directly feeding the raw audio waveform to a feature encoder to extract an audio representation. This simplifies the preprocessing step and allows the model to learn the most essential features.</p> <h3 class="relative group"><a id="audio-classification" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#audio-classification"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Audio classification</span></h3> <p data-svelte-h="svelte-wenpsc">Audio classification is a task that labels audio data from a predefined set of classes. It is a broad category with many specific applications, some of which include:</p> <ul data-svelte-h="svelte-ii0fya"><li>acoustic scene classification: label audio with a scene label (“office”, “beach”, “stadium”)</li> <li>acoustic event detection: label audio with a sound event label (“car horn”, “whale calling”, “glass breaking”)</li> <li>tagging: label audio containing multiple sounds (birdsongs, speaker identification in a meeting)</li> <li>music classification: label music with a genre label (“metal”, “hip-hop”, “country”)</li></ul> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> pipeline

	<span class="hljs-meta">>>> </span>classifier = pipeline(task=<span class="hljs-string">"audio-classification"</span>, model=<span class="hljs-string">"superb/hubert-base-superb-er"</span>)
	<span class="hljs-meta">>>> </span>preds = classifier(<span class="hljs-string">"https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac"</span>)
	<span class="hljs-meta">>>> </span>preds = [{<span class="hljs-string">"score"</span>: <span class="hljs-built_in">round</span>(pred[<span class="hljs-string">"score"</span>], <span class="hljs-number">4</span>), <span class="hljs-string">"label"</span>: pred[<span class="hljs-string">"label"</span>]} <span class="hljs-keyword">for</span> pred <span class="hljs-keyword">in</span> preds]
	<span class="hljs-meta">>>> </span>preds
	[{<span class="hljs-string">'score'</span>: <span class="hljs-number">0.4532</span>, <span class="hljs-string">'label'</span>: <span class="hljs-string">'hap'</span>},
	{<span class="hljs-string">'score'</span>: <span class="hljs-number">0.3622</span>, <span class="hljs-string">'label'</span>: <span class="hljs-string">'sad'</span>},
	{<span class="hljs-string">'score'</span>: <span class="hljs-number">0.0943</span>, <span class="hljs-string">'label'</span>: <span class="hljs-string">'neu'</span>},
	{<span class="hljs-string">'score'</span>: <span class="hljs-number">0.0903</span>, <span class="hljs-string">'label'</span>: <span class="hljs-string">'ang'</span>}]<!-- HTML_TAG_END --></pre></div> <h3 class="relative group"><a id="automatic-speech-recognition" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#automatic-speech-recognition"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Automatic speech recognition</span></h3> <p data-svelte-h="svelte-1uioxjn">Automatic speech recognition (ASR) transcribes speech into text. It is one of the most common audio tasks due partly to speech being such a natural form of human communication. Today, ASR systems are embedded in “smart” technology products like speakers, phones, and cars. We can ask our virtual assistants to play music, set reminders, and tell us the weather.</p> <p data-svelte-h="svelte-13nabhb">But one of the key challenges Transformer architectures have helped with is in low-resource languages. By pretraining on large amounts of speech data, finetuning the model on only one hour of labeled speech data in a low-resource language can still produce high-quality results compared to previous ASR systems trained on 100x more labeled data.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> pipeline

	<span class="hljs-meta">>>> </span>transcriber = pipeline(task=<span class="hljs-string">"automatic-speech-recognition"</span>, model=<span class="hljs-string">"openai/whisper-small"</span>)
	<span class="hljs-meta">>>> </span>transcriber(<span class="hljs-string">"https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac"</span>)
	{<span class="hljs-string">'text'</span>: <span class="hljs-string">' I have a dream that one day this nation will rise up and live out the true meaning of its creed.'</span>}<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="computer-vision" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#computer-vision"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Computer vision</span></h2> <p data-svelte-h="svelte-1iprewp">One of the first and earliest successful computer vision tasks was recognizing images of zip code numbers using a <a href="glossary#convolution">convolutional neural network (CNN)</a>. An image is composed of pixels, and each pixel has a numerical value. This makes it easy to represent an image as a matrix of pixel values. Each particular combination of pixel values describes the colors of an image.</p> <p data-svelte-h="svelte-9en3oh">Two general ways computer vision tasks can be solved are:</p> <ol data-svelte-h="svelte-1n2rw1y"><li>Use convolutions to learn the hierarchical features of an image from low-level features to high-level abstract things.</li> <li>Split an image into patches and use a Transformer to gradually learn how each image patch is related to each other to form an image. Unlike the bottom-up approach favored by a CNN, this is kind of like starting out with a blurry image and then gradually bringing it into focus.</li></ol> <h3 class="relative group"><a id="image-classification" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#image-classification"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Image classification</span></h3> <p data-svelte-h="svelte-1xnawqn">Image classification labels an entire image from a predefined set of classes. Like most classification tasks, there are many practical use cases for image classification, some of which include:</p> <ul data-svelte-h="svelte-hzwblv"><li>healthcare: label medical images to detect disease or monitor patient health</li> <li>environment: label satellite images to monitor deforestation, inform wildland management or detect wildfires</li> <li>agriculture: label images of crops to monitor plant health or satellite images for land use monitoring</li> <li>ecology: label images of animal or plant species to monitor wildlife populations or track endangered species</li></ul> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> pipeline

	<span class="hljs-meta">>>> </span>classifier = pipeline(task=<span class="hljs-string">"image-classification"</span>)
	<span class="hljs-meta">>>> </span>preds = classifier(
	<span class="hljs-meta">... </span> <span class="hljs-string">"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"</span>
	<span class="hljs-meta">... </span>)
	<span class="hljs-meta">>>> </span>preds = [{<span class="hljs-string">"score"</span>: <span class="hljs-built_in">round</span>(pred[<span class="hljs-string">"score"</span>], <span class="hljs-number">4</span>), <span class="hljs-string">"label"</span>: pred[<span class="hljs-string">"label"</span>]} <span class="hljs-keyword">for</span> pred <span class="hljs-keyword">in</span> preds]
	<span class="hljs-meta">>>> </span><span class="hljs-built_in">print</span>(*preds, sep=<span class="hljs-string">"\n"</span>)
	{<span class="hljs-string">'score'</span>: <span class="hljs-number">0.4335</span>, <span class="hljs-string">'label'</span>: <span class="hljs-string">'lynx, catamount'</span>}
	{<span class="hljs-string">'score'</span>: <span class="hljs-number">0.0348</span>, <span class="hljs-string">'label'</span>: <span class="hljs-string">'cougar, puma, catamount, mountain lion, painter, panther, Felis concolor'</span>}
	{<span class="hljs-string">'score'</span>: <span class="hljs-number">0.0324</span>, <span class="hljs-string">'label'</span>: <span class="hljs-string">'snow leopard, ounce, Panthera uncia'</span>}
	{<span class="hljs-string">'score'</span>: <span class="hljs-number">0.0239</span>, <span class="hljs-string">'label'</span>: <span class="hljs-string">'Egyptian cat'</span>}
	{<span class="hljs-string">'score'</span>: <span class="hljs-number">0.0229</span>, <span class="hljs-string">'label'</span>: <span class="hljs-string">'tiger cat'</span>}<!-- HTML_TAG_END --></pre></div> <h3 class="relative group"><a id="object-detection" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#object-detection"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Object detection</span></h3> <p data-svelte-h="svelte-rzixgf">Unlike image classification, object detection identifies multiple objects within an image and the objects’ positions in an image (defined by the bounding box). Some example applications of object detection include:</p> <ul data-svelte-h="svelte-11ca9ev"><li>self-driving vehicles: detect everyday traffic objects such as other vehicles, pedestrians, and traffic lights</li> <li>remote sensing: disaster monitoring, urban planning, and weather forecasting</li> <li>defect detection: detect cracks or structural damage in buildings, and manufacturing defects</li></ul> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> pipeline

	<span class="hljs-meta">>>> </span>detector = pipeline(task=<span class="hljs-string">"object-detection"</span>)
	<span class="hljs-meta">>>> </span>preds = detector(
	<span class="hljs-meta">... </span> <span class="hljs-string">"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"</span>
	<span class="hljs-meta">... </span>)
	<span class="hljs-meta">>>> </span>preds = [{<span class="hljs-string">"score"</span>: <span class="hljs-built_in">round</span>(pred[<span class="hljs-string">"score"</span>], <span class="hljs-number">4</span>), <span class="hljs-string">"label"</span>: pred[<span class="hljs-string">"label"</span>], <span class="hljs-string">"box"</span>: pred[<span class="hljs-string">"box"</span>]} <span class="hljs-keyword">for</span> pred <span class="hljs-keyword">in</span> preds]
	<span class="hljs-meta">>>> </span>preds
	[{<span class="hljs-string">'score'</span>: <span class="hljs-number">0.9865</span>,
	<span class="hljs-string">'label'</span>: <span class="hljs-string">'cat'</span>,
	<span class="hljs-string">'box'</span>: {<span class="hljs-string">'xmin'</span>: <span class="hljs-number">178</span>, <span class="hljs-string">'ymin'</span>: <span class="hljs-number">154</span>, <span class="hljs-string">'xmax'</span>: <span class="hljs-number">882</span>, <span class="hljs-string">'ymax'</span>: <span class="hljs-number">598</span>}}]<!-- HTML_TAG_END --></pre></div> <h3 class="relative group"><a id="image-segmentation" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#image-segmentation"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Image segmentation</span></h3> <p data-svelte-h="svelte-z5ffax">Image segmentation is a pixel-level task that assigns every pixel in an image to a class. It differs from object detection, which uses bounding boxes to label and predict objects in an image because segmentation is more granular. Segmentation can detect objects at a pixel-level. There are several types of image segmentation:</p> <ul data-svelte-h="svelte-1uq2lxo"><li>instance segmentation: in addition to labeling the class of an object, it also labels each distinct instance of an object (“dog-1”, “dog-2”)</li> <li>panoptic segmentation: a combination of semantic and instance segmentation; it labels each pixel with a semantic class <strong>and</strong> each distinct instance of an object</li></ul> <p data-svelte-h="svelte-fhukdm">Segmentation tasks are helpful in self-driving vehicles to create a pixel-level map of the world around them so they can navigate safely around pedestrians and other vehicles. It is also useful for medical imaging, where the task’s finer granularity can help identify abnormal cells or organ features. Image segmentation can also be used in ecommerce to virtually try on clothes or create augmented reality experiences by overlaying objects in the real world through your camera.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> pipeline

	<span class="hljs-meta">>>> </span>segmenter = pipeline(task=<span class="hljs-string">"image-segmentation"</span>)
	<span class="hljs-meta">>>> </span>preds = segmenter(
	<span class="hljs-meta">... </span> <span class="hljs-string">"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"</span>
	<span class="hljs-meta">... </span>)
	<span class="hljs-meta">>>> </span>preds = [{<span class="hljs-string">"score"</span>: <span class="hljs-built_in">round</span>(pred[<span class="hljs-string">"score"</span>], <span class="hljs-number">4</span>), <span class="hljs-string">"label"</span>: pred[<span class="hljs-string">"label"</span>]} <span class="hljs-keyword">for</span> pred <span class="hljs-keyword">in</span> preds]
	<span class="hljs-meta">>>> </span><span class="hljs-built_in">print</span>(*preds, sep=<span class="hljs-string">"\n"</span>)
	{<span class="hljs-string">'score'</span>: <span class="hljs-number">0.9879</span>, <span class="hljs-string">'label'</span>: <span class="hljs-string">'LABEL_184'</span>}
	{<span class="hljs-string">'score'</span>: <span class="hljs-number">0.9973</span>, <span class="hljs-string">'label'</span>: <span class="hljs-string">'snow'</span>}
	{<span class="hljs-string">'score'</span>: <span class="hljs-number">0.9972</span>, <span class="hljs-string">'label'</span>: <span class="hljs-string">'cat'</span>}<!-- HTML_TAG_END --></pre></div> <h3 class="relative group"><a id="depth-estimation" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#depth-estimation"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Depth estimation</span></h3> <p data-svelte-h="svelte-ee4yo2">Depth estimation predicts the distance of each pixel in an image from the camera. This computer vision task is especially important for scene understanding and reconstruction. For example, in self-driving cars, vehicles need to understand how far objects like pedestrians, traffic signs, and other vehicles are to avoid obstacles and collisions. Depth information is also helpful for constructing 3D representations from 2D images and can be used to create high-quality 3D representations of biological structures or buildings.</p> <p data-svelte-h="svelte-1tcdxbt">There are two approaches to depth estimation:</p> <ul data-svelte-h="svelte-1h9rsal"><li>stereo: depths are estimated by comparing two images of the same image from slightly different angles</li> <li>monocular: depths are estimated from a single image</li></ul> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> pipeline

	<span class="hljs-meta">>>> </span>depth_estimator = pipeline(task=<span class="hljs-string">"depth-estimation"</span>)
	<span class="hljs-meta">>>> </span>preds = depth_estimator(
	<span class="hljs-meta">... </span> <span class="hljs-string">"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"</span>
	<span class="hljs-meta">... </span>)<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="natural-language-processing" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#natural-language-processing"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Natural language processing</span></h2> <p data-svelte-h="svelte-p86gd4">NLP tasks are among the most common types of tasks because text is such a natural way for us to communicate. To get text into a format recognized by a model, it needs to be tokenized. This means dividing a sequence of text into separate words or subwords (tokens) and then converting these tokens into numbers. As a result, you can represent a sequence of text as a sequence of numbers, and once you have a sequence of numbers, it can be input into a model to solve all sorts of NLP tasks!</p> <h3 class="relative group"><a id="text-classification" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#text-classification"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Text classification</span></h3> <p data-svelte-h="svelte-xkrisb">Like classification tasks in any modality, text classification labels a sequence of text (it can be sentence-level, a paragraph, or a document) from a predefined set of classes. There are many practical applications for text classification, some of which include:</p> <ul data-svelte-h="svelte-1785q2k"><li>sentiment analysis: label text according to some polarity like <code>positive</code> or <code>negative</code> which can inform and support decision-making in fields like politics, finance, and marketing</li> <li>content classification: label text according to some topic to help organize and filter information in news and social media feeds (<code>weather</code>, <code>sports</code>, <code>finance</code>, etc.)</li></ul> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> pipeline

	<span class="hljs-meta">>>> </span>classifier = pipeline(task=<span class="hljs-string">"sentiment-analysis"</span>)
	<span class="hljs-meta">>>> </span>preds = classifier(<span class="hljs-string">"Hugging Face is the best thing since sliced bread!"</span>)
	<span class="hljs-meta">>>> </span>preds = [{<span class="hljs-string">"score"</span>: <span class="hljs-built_in">round</span>(pred[<span class="hljs-string">"score"</span>], <span class="hljs-number">4</span>), <span class="hljs-string">"label"</span>: pred[<span class="hljs-string">"label"</span>]} <span class="hljs-keyword">for</span> pred <span class="hljs-keyword">in</span> preds]
	<span class="hljs-meta">>>> </span>preds
	[{<span class="hljs-string">'score'</span>: <span class="hljs-number">0.9991</span>, <span class="hljs-string">'label'</span>: <span class="hljs-string">'POSITIVE'</span>}]<!-- HTML_TAG_END --></pre></div> <h3 class="relative group"><a id="token-classification" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#token-classification"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Token classification</span></h3> <p data-svelte-h="svelte-1ybu6t8">In any NLP task, text is preprocessed by separating the sequence of text into individual words or subwords. These are known as <a href="glossary#token">tokens</a>. Token classification assigns each token a label from a predefined set of classes.</p> <p data-svelte-h="svelte-1tgdzja">Two common types of token classification are:</p> <ul data-svelte-h="svelte-3b1070"><li>named entity recognition (NER): label a token according to an entity category like organization, person, location or date. NER is especially popular in biomedical settings, where it can label genes, proteins, and drug names.</li> <li>part-of-speech tagging (POS): label a token according to its part-of-speech like noun, verb, or adjective. POS is useful for helping translation systems understand how two identical words are grammatically different (bank as a noun versus bank as a verb).</li></ul> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> pipeline

	<span class="hljs-meta">>>> </span>classifier = pipeline(task=<span class="hljs-string">"ner"</span>)
	<span class="hljs-meta">>>> </span>preds = classifier(<span class="hljs-string">"Hugging Face is a French company based in New York City."</span>)
	<span class="hljs-meta">>>> </span>preds = [
	<span class="hljs-meta">... </span> {
	<span class="hljs-meta">... </span> <span class="hljs-string">"entity"</span>: pred[<span class="hljs-string">"entity"</span>],
	<span class="hljs-meta">... </span> <span class="hljs-string">"score"</span>: <span class="hljs-built_in">round</span>(pred[<span class="hljs-string">"score"</span>], <span class="hljs-number">4</span>),
	<span class="hljs-meta">... </span> <span class="hljs-string">"index"</span>: pred[<span class="hljs-string">"index"</span>],
	<span class="hljs-meta">... </span> <span class="hljs-string">"word"</span>: pred[<span class="hljs-string">"word"</span>],
	<span class="hljs-meta">... </span> <span class="hljs-string">"start"</span>: pred[<span class="hljs-string">"start"</span>],
	<span class="hljs-meta">... </span> <span class="hljs-string">"end"</span>: pred[<span class="hljs-string">"end"</span>],
	<span class="hljs-meta">... </span> }
	<span class="hljs-meta">... </span> <span class="hljs-keyword">for</span> pred <span class="hljs-keyword">in</span> preds
	<span class="hljs-meta">... </span>]
	<span class="hljs-meta">>>> </span><span class="hljs-built_in">print</span>(*preds, sep=<span class="hljs-string">"\n"</span>)
	{<span class="hljs-string">'entity'</span>: <span class="hljs-string">'I-ORG'</span>, <span class="hljs-string">'score'</span>: <span class="hljs-number">0.9968</span>, <span class="hljs-string">'index'</span>: <span class="hljs-number">1</span>, <span class="hljs-string">'word'</span>: <span class="hljs-string">'Hu'</span>, <span class="hljs-string">'start'</span>: <span class="hljs-number">0</span>, <span class="hljs-string">'end'</span>: <span class="hljs-number">2</span>}
	{<span class="hljs-string">'entity'</span>: <span class="hljs-string">'I-ORG'</span>, <span class="hljs-string">'score'</span>: <span class="hljs-number">0.9293</span>, <span class="hljs-string">'index'</span>: <span class="hljs-number">2</span>, <span class="hljs-string">'word'</span>: <span class="hljs-string">'##gging'</span>, <span class="hljs-string">'start'</span>: <span class="hljs-number">2</span>, <span class="hljs-string">'end'</span>: <span class="hljs-number">7</span>}
	{<span class="hljs-string">'entity'</span>: <span class="hljs-string">'I-ORG'</span>, <span class="hljs-string">'score'</span>: <span class="hljs-number">0.9763</span>, <span class="hljs-string">'index'</span>: <span class="hljs-number">3</span>, <span class="hljs-string">'word'</span>: <span class="hljs-string">'Face'</span>, <span class="hljs-string">'start'</span>: <span class="hljs-number">8</span>, <span class="hljs-string">'end'</span>: <span class="hljs-number">12</span>}
	{<span class="hljs-string">'entity'</span>: <span class="hljs-string">'I-MISC'</span>, <span class="hljs-string">'score'</span>: <span class="hljs-number">0.9983</span>, <span class="hljs-string">'index'</span>: <span class="hljs-number">6</span>, <span class="hljs-string">'word'</span>: <span class="hljs-string">'French'</span>, <span class="hljs-string">'start'</span>: <span class="hljs-number">18</span>, <span class="hljs-string">'end'</span>: <span class="hljs-number">24</span>}
	{<span class="hljs-string">'entity'</span>: <span class="hljs-string">'I-LOC'</span>, <span class="hljs-string">'score'</span>: <span class="hljs-number">0.999</span>, <span class="hljs-string">'index'</span>: <span class="hljs-number">10</span>, <span class="hljs-string">'word'</span>: <span class="hljs-string">'New'</span>, <span class="hljs-string">'start'</span>: <span class="hljs-number">42</span>, <span class="hljs-string">'end'</span>: <span class="hljs-number">45</span>}
	{<span class="hljs-string">'entity'</span>: <span class="hljs-string">'I-LOC'</span>, <span class="hljs-string">'score'</span>: <span class="hljs-number">0.9987</span>, <span class="hljs-string">'index'</span>: <span class="hljs-number">11</span>, <span class="hljs-string">'word'</span>: <span class="hljs-string">'York'</span>, <span class="hljs-string">'start'</span>: <span class="hljs-number">46</span>, <span class="hljs-string">'end'</span>: <span class="hljs-number">50</span>}
	{<span class="hljs-string">'entity'</span>: <span class="hljs-string">'I-LOC'</span>, <span class="hljs-string">'score'</span>: <span class="hljs-number">0.9992</span>, <span class="hljs-string">'index'</span>: <span class="hljs-number">12</span>, <span class="hljs-string">'word'</span>: <span class="hljs-string">'City'</span>, <span class="hljs-string">'start'</span>: <span class="hljs-number">51</span>, <span class="hljs-string">'end'</span>: <span class="hljs-number">55</span>}<!-- HTML_TAG_END --></pre></div> <h3 class="relative group"><a id="question-answering" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#question-answering"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Question answering</span></h3> <p data-svelte-h="svelte-jggqzr">Question answering is another token-level task that returns an answer to a question, sometimes with context (open-domain) and other times without context (closed-domain). This task happens whenever we ask a virtual assistant something like whether a restaurant is open. It can also provide customer or technical support and help search engines retrieve the relevant information you’re asking for.</p> <p data-svelte-h="svelte-16zyksn">There are two common types of question answering:</p> <ul data-svelte-h="svelte-11pugpm"><li>extractive: given a question and some context, the answer is a span of text from the context the model must extract</li> <li>abstractive: given a question and some context, the answer is generated from the context; this approach is handled by the <a href="/docs/transformers/pr_33913/en/main_classes/pipelines#transformers.Text2TextGenerationPipeline">Text2TextGenerationPipeline</a> instead of the <a href="/docs/transformers/pr_33913/en/main_classes/pipelines#transformers.QuestionAnsweringPipeline">QuestionAnsweringPipeline</a> shown below</li></ul> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> pipeline

	<span class="hljs-meta">>>> </span>question_answerer = pipeline(task=<span class="hljs-string">"question-answering"</span>)
	<span class="hljs-meta">>>> </span>preds = question_answerer(
	<span class="hljs-meta">... </span> question=<span class="hljs-string">"What is the name of the repository?"</span>,
	<span class="hljs-meta">... </span> context=<span class="hljs-string">"The name of the repository is huggingface/transformers"</span>,
	<span class="hljs-meta">... </span>)
	<span class="hljs-meta">>>> </span><span class="hljs-built_in">print</span>(
	<span class="hljs-meta">... </span> <span class="hljs-string">f"score: <span class="hljs-subst">{<span class="hljs-built_in">round</span>(preds[<span class="hljs-string">'score'</span>], <span class="hljs-number">4</span>)}</span>, start: <span class="hljs-subst">{preds[<span class="hljs-string">'start'</span>]}</span>, end: <span class="hljs-subst">{preds[<span class="hljs-string">'end'</span>]}</span>, answer: <span class="hljs-subst">{preds[<span class="hljs-string">'answer'</span>]}</span>"</span>
	<span class="hljs-meta">... </span>)
	score: <span class="hljs-number">0.9327</span>, start: <span class="hljs-number">30</span>, end: <span class="hljs-number">54</span>, answer: huggingface/transformers<!-- HTML_TAG_END --></pre></div> <h3 class="relative group"><a id="summarization" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#summarization"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Summarization</span></h3> <p data-svelte-h="svelte-4carp">Summarization creates a shorter version of a text from a longer one while trying to preserve most of the meaning of the original document. Summarization is a sequence-to-sequence task; it outputs a shorter text sequence than the input. There are a lot of long-form documents that can be summarized to help readers quickly understand the main points. Legislative bills, legal and financial documents, patents, and scientific papers are a few examples of documents that could be summarized to save readers time and serve as a reading aid.</p> <p data-svelte-h="svelte-1s0vpsm">Like question answering, there are two types of summarization:</p> <ul data-svelte-h="svelte-1lufs1k"><li>extractive: identify and extract the most important sentences from the original text</li> <li>abstractive: generate the target summary (which may include new words not in the input document) from the original text; the <a href="/docs/transformers/pr_33913/en/main_classes/pipelines#transformers.SummarizationPipeline">SummarizationPipeline</a> uses the abstractive approach</li></ul> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> pipeline

	<span class="hljs-meta">>>> </span>summarizer = pipeline(task=<span class="hljs-string">"summarization"</span>)
	<span class="hljs-meta">>>> </span>summarizer(
	<span class="hljs-meta">... </span> <span class="hljs-string">"In this work, we presented the Transformer, the first sequence transduction model based entirely on attention, replacing the recurrent layers most commonly used in encoder-decoder architectures with multi-headed self-attention. For translation tasks, the Transformer can be trained significantly faster than architectures based on recurrent or convolutional layers. On both WMT 2014 English-to-German and WMT 2014 English-to-French translation tasks, we achieve a new state of the art. In the former task our best model outperforms even all previously reported ensembles."</span>
	<span class="hljs-meta">... </span>)
	[{<span class="hljs-string">'summary_text'</span>: <span class="hljs-string">' The Transformer is the first sequence transduction model based entirely on attention . It replaces the recurrent layers most commonly used in encoder-decoder architectures with multi-headed self-attention . For translation tasks, the Transformer can be trained significantly faster than architectures based on recurrent or convolutional layers .'</span>}]<!-- HTML_TAG_END --></pre></div> <h3 class="relative group"><a id="translation" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#translation"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Translation</span></h3> <p data-svelte-h="svelte-1oa82bc">Translation converts a sequence of text in one language to another. It is important in helping people from different backgrounds communicate with each other, help translate content to reach wider audiences, and even be a learning tool to help people learn a new language. Along with summarization, translation is a sequence-to-sequence task, meaning the model receives an input sequence and returns a target output sequence.</p> <p data-svelte-h="svelte-l2a5jq">In the early days, translation models were mostly monolingual, but recently, there has been increasing interest in multilingual models that can translate between many pairs of languages.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> pipeline

	<span class="hljs-meta">>>> </span>text = <span class="hljs-string">"translate English to French: Hugging Face is a community-based open-source platform for machine learning."</span>
	<span class="hljs-meta">>>> </span>translator = pipeline(task=<span class="hljs-string">"translation"</span>, model=<span class="hljs-string">"google-t5/t5-small"</span>)
	<span class="hljs-meta">>>> </span>translator(text)
	[{<span class="hljs-string">'translation_text'</span>: <span class="hljs-string">"Hugging Face est une tribune communautaire de l'apprentissage des machines."</span>}]<!-- HTML_TAG_END --></pre></div> <h3 class="relative group"><a id="language-modeling" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#language-modeling"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Language modeling</span></h3> <p data-svelte-h="svelte-1hrjdjs">Language modeling is a task that predicts a word in a sequence of text. It has become a very popular NLP task because a pretrained language model can be finetuned for many other downstream tasks. Lately, there has been a lot of interest in large language models (LLMs) which demonstrate zero- or few-shot learning. This means the model can solve tasks it wasn’t explicitly trained to do! Language models can be used to generate fluent and convincing text, though you need to be careful since the text may not always be accurate.</p> <p data-svelte-h="svelte-128qp3h">There are two types of language modeling:</p> <ul><li><p data-svelte-h="svelte-cojn0n">causal: the model’s objective is to predict the next token in a sequence, and future tokens are masked</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> pipeline

	<span class="hljs-meta">>>> </span>prompt = <span class="hljs-string">"Hugging Face is a community-based open-source platform for machine learning."</span>
	<span class="hljs-meta">>>> </span>generator = pipeline(task=<span class="hljs-string">"text-generation"</span>)
	<span class="hljs-meta">>>> </span>generator(prompt) <span class="hljs-comment"># doctest: +SKIP</span><!-- HTML_TAG_END --></pre></div></li> <li><p data-svelte-h="svelte-b2em4k">masked: the model’s objective is to predict a masked token in a sequence with full access to the tokens in the sequence</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">>>> </span>text = <span class="hljs-string">"Hugging Face is a community-based open-source <mask> for machine learning."</span>
	<span class="hljs-meta">>>> </span>fill_mask = pipeline(task=<span class="hljs-string">"fill-mask"</span>)
	<span class="hljs-meta">>>> </span>preds = fill_mask(text, top_k=<span class="hljs-number">1</span>)
	<span class="hljs-meta">>>> </span>preds = [
	<span class="hljs-meta">... </span> {
	<span class="hljs-meta">... </span> <span class="hljs-string">"score"</span>: <span class="hljs-built_in">round</span>(pred[<span class="hljs-string">"score"</span>], <span class="hljs-number">4</span>),
	<span class="hljs-meta">... </span> <span class="hljs-string">"token"</span>: pred[<span class="hljs-string">"token"</span>],
	<span class="hljs-meta">... </span> <span class="hljs-string">"token_str"</span>: pred[<span class="hljs-string">"token_str"</span>],
	<span class="hljs-meta">... </span> <span class="hljs-string">"sequence"</span>: pred[<span class="hljs-string">"sequence"</span>],
	<span class="hljs-meta">... </span> }
	<span class="hljs-meta">... </span> <span class="hljs-keyword">for</span> pred <span class="hljs-keyword">in</span> preds
	<span class="hljs-meta">... </span>]
	<span class="hljs-meta">>>> </span>preds
	[{<span class="hljs-string">'score'</span>: <span class="hljs-number">0.2236</span>,
	<span class="hljs-string">'token'</span>: <span class="hljs-number">1761</span>,
	<span class="hljs-string">'token_str'</span>: <span class="hljs-string">' platform'</span>,
	<span class="hljs-string">'sequence'</span>: <span class="hljs-string">'Hugging Face is a community-based open-source platform for machine learning.'</span>}]<!-- HTML_TAG_END --></pre></div></li></ul> <h2 class="relative group"><a id="multimodal" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#multimodal"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Multimodal</span></h2> <p data-svelte-h="svelte-1wn2c65">Multimodal tasks require a model to process multiple data modalities (text, image, audio, video) to solve a particular problem. Image captioning is an example of a multimodal task where the model takes an image as input and outputs a sequence of text describing the image or some properties of the image.</p> <p data-svelte-h="svelte-1ax7w6c">Although multimodal models work with different data types or modalities, internally, the preprocessing steps help the model convert all the data types into embeddings (vectors or list of numbers that holds meaningful information about the data). For a task like image captioning, the model learns relationships between image embeddings and text embeddings.</p> <h3 class="relative group"><a id="document-question-answering" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#document-question-answering"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Document question answering</span></h3> <p data-svelte-h="svelte-1sl6ve4">Document question answering is a task that answers natural language questions from a document. Unlike a token-level question answering task which takes text as input, document question answering takes an image of a document as input along with a question about the document and returns an answer. Document question answering can be used to parse structured documents and extract key information from it. In the example below, the total amount and change due can be extracted from a receipt.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> pipeline
	<span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> PIL <span class="hljs-keyword">import</span> Image
	<span class="hljs-meta">>>> </span><span class="hljs-keyword">import</span> requests

	<span class="hljs-meta">>>> </span>url = <span class="hljs-string">"https://huggingface.co/datasets/hf-internal-testing/example-documents/resolve/main/jpeg_images/2.jpg"</span>
	<span class="hljs-meta">>>> </span>image = Image.<span class="hljs-built_in">open</span>(requests.get(url, stream=<span class="hljs-literal">True</span>).raw)

	<span class="hljs-meta">>>> </span>doc_question_answerer = pipeline(<span class="hljs-string">"document-question-answering"</span>, model=<span class="hljs-string">"magorshunov/layoutlm-invoices"</span>)
	<span class="hljs-meta">>>> </span>preds = doc_question_answerer(
	<span class="hljs-meta">... </span> question=<span class="hljs-string">"What is the total amount?"</span>,
	<span class="hljs-meta">... </span> image=image,
	<span class="hljs-meta">... </span>)
	<span class="hljs-meta">>>> </span>preds
	[{<span class="hljs-string">'score'</span>: <span class="hljs-number">0.8531</span>, <span class="hljs-string">'answer'</span>: <span class="hljs-string">'17,000'</span>, <span class="hljs-string">'start'</span>: <span class="hljs-number">4</span>, <span class="hljs-string">'end'</span>: <span class="hljs-number">4</span>}]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-14177uf">Hopefully, this page has given you some more background information about all the types of tasks in each modality and the practical importance of each one. In the next <a href="tasks_explained">section</a>, you’ll learn <strong>how</strong> 🤗 Transformers work to solve these tasks.</p> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/transformers/blob/main/docs/source/en/task_summary.md" target="_blank"><span data-svelte-h="svelte-1kd6by1"><</span> <span data-svelte-h="svelte-x0xyl0">></span> <span data-svelte-h="svelte-1dajgef"><span class="underline ml-1.5">Update</span> on GitHub</span></a> <p></p>

	<script>
	{
	__sveltekit_z647wz = {
	assets: "/docs/transformers/pr_33913/en",
	base: "/docs/transformers/pr_33913/en",
	env: {}
	};

	const element = document.currentScript.parentElement;

	const data = [null,null];

	Promise.all([
	import("/docs/transformers/pr_33913/en/_app/immutable/entry/start.b67f883f.js"),
	import("/docs/transformers/pr_33913/en/_app/immutable/entry/app.e436b1f2.js")
	]).then(([kit, app]) => {
	kit.start(app, element, {
	node_ids: [0, 414],
	data,
	form: null,
	error: null
	});
	});
	}
	</script>

Xet Storage Details

Size:: 83.6 kB
Xet hash:: 1204eb11326bb9ca837c09b3eca10005447461fc402cb0870216655f0ccf1528

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.