Buckets:

rtrm's picture
download
raw
17.8 kB
<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Overview&quot;,&quot;local&quot;:&quot;overview&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;What is Hiera?&quot;,&quot;local&quot;:&quot;what-is-hiera&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;From CNNs to ViTs&quot;,&quot;local&quot;:&quot;from-cnns-to-vits&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Hiera’s Approach: Pretraining Task is All You Need&quot;,&quot;local&quot;:&quot;hieras-approach-pretraining-task-is-all-you-need&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Simplifying MViTv2&quot;,&quot;local&quot;:&quot;simplifying-mvitv2&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Masked Autoencoder&quot;,&quot;local&quot;:&quot;masked-autoencoder&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2}">
<link href="/docs/computer-vision-course/pr_397/en/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload">
<link rel="modulepreload" href="/docs/computer-vision-course/pr_397/en/_app/immutable/entry/start.7f209408.js">
<link rel="modulepreload" href="/docs/computer-vision-course/pr_397/en/_app/immutable/chunks/scheduler.7bc62968.js">
<link rel="modulepreload" href="/docs/computer-vision-course/pr_397/en/_app/immutable/chunks/singletons.b15acae1.js">
<link rel="modulepreload" href="/docs/computer-vision-course/pr_397/en/_app/immutable/chunks/paths.11cdc4b4.js">
<link rel="modulepreload" href="/docs/computer-vision-course/pr_397/en/_app/immutable/entry/app.32e8338e.js">
<link rel="modulepreload" href="/docs/computer-vision-course/pr_397/en/_app/immutable/chunks/index.2f8492b0.js">
<link rel="modulepreload" href="/docs/computer-vision-course/pr_397/en/_app/immutable/nodes/0.e37092e8.js">
<link rel="modulepreload" href="/docs/computer-vision-course/pr_397/en/_app/immutable/nodes/28.49090f5f.js">
<link rel="modulepreload" href="/docs/computer-vision-course/pr_397/en/_app/immutable/chunks/index.514d62da.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Overview&quot;,&quot;local&quot;:&quot;overview&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;What is Hiera?&quot;,&quot;local&quot;:&quot;what-is-hiera&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;From CNNs to ViTs&quot;,&quot;local&quot;:&quot;from-cnns-to-vits&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Hiera’s Approach: Pretraining Task is All You Need&quot;,&quot;local&quot;:&quot;hieras-approach-pretraining-task-is-all-you-need&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Simplifying MViTv2&quot;,&quot;local&quot;:&quot;simplifying-mvitv2&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Masked Autoencoder&quot;,&quot;local&quot;:&quot;masked-autoencoder&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <h2 class="relative group"><a id="overview" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#overview"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Overview</span></h2> <h3 class="relative group"><a id="what-is-hiera" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#what-is-hiera"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>What is Hiera?</span></h3> <p data-svelte-h="svelte-164bo80"><a href="https://arxiv.org/abs/2306.00989" rel="nofollow">Hiera</a> (Hierarchical Vision Transformer) is an architecture that achieves high accuracy without the need for specialized components found in other vision models. The authors propose pretraining Hiera with a strong visual pretext task to remove unnecessary complexity and create a faster and more accurate model.</p> <p data-svelte-h="svelte-4tfkgy"><img src="https://huggingface.co/datasets/hf-vision/course-assets/resolve/main/hiera_images/hiera_architecture.png" alt="Hiera Architecture"></p> <h3 class="relative group"><a id="from-cnns-to-vits" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#from-cnns-to-vits"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>From CNNs to ViTs</span></h3> <p data-svelte-h="svelte-1ndpe1f">CNNs and hierarchical models are well-suited for computer vision tasks because they can effectively capture the hierarchical and spatial structure of visual data. These models use fewer channels but higher spatial resolution in the early stages to extract simpler features and more channels but lower spatial resolution in the later stages to extract more complex features.</p> <p data-svelte-h="svelte-1szpz9z"><img src="https://huggingface.co/datasets/hf-vision/course-assets/resolve/main/hiera_images/CNN_architecture.webp" alt="CNNs"></p> <p data-svelte-h="svelte-10y31yr">On the other hand, Vision Transformers (ViTs) are more accurate, scalable, and architecturally simple models that took computer vision by storm when they were introduced. However, this simplicity comes at a cost: they lack this “vision inductive bias” (their architecture is not designed to work specifically with visual data).</p> <p data-svelte-h="svelte-1q5bgys">Many efforts have been made to adapt ViTs, generally by adding hierarchical components to compensate for this lack of inductive bias in their architecture. Unfortunately, all of the resulting models turned out to be slower, bigger and more difficult to scale.</p> <h3 class="relative group"><a id="hieras-approach-pretraining-task-is-all-you-need" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#hieras-approach-pretraining-task-is-all-you-need"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Hiera’s Approach: Pretraining Task is All You Need</span></h3> <p data-svelte-h="svelte-5danx">Authors of the Hiera paper argue that a ViT model can learn spatial reasoning and perform well on computer vision tasks by using a strong visual pretext task called MAE and thus, they can remove unnecessary components and complexity from state-of-the-art multi-stage vision transformers to achieve greater accuracy and speed.</p> <p data-svelte-h="svelte-6sxof8">What components are the paper authors actually removing? To understand this we first have to introduce <a href="https://arxiv.org/abs/2112.01526" rel="nofollow">MViTv2</a> which is the base hierarchical architecture from which Hiera is derived. MViTv2 learns multi-scale representations over its four stages: it starts by modeling low level features with a small channel capacity but high spatial resolution, and then in each stage trades channel capacity for spatial resolution to model more complex high-level features in deeper layers.</p> <p data-svelte-h="svelte-16cjei5"><img src="https://huggingface.co/datasets/hf-vision/course-assets/resolve/main/hiera_images/mvitv2.png" alt="MViTv2"></p> <p data-svelte-h="svelte-1uqfygm">Instead of digging deep into MViTv2’s key features (since it’s not our main topic), we will breifly explain them in the next section to illustrate how researchers create Hiera by simplyfing this base architecture.</p> <h3 class="relative group"><a id="simplifying-mvitv2" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#simplifying-mvitv2"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Simplifying MViTv2</span></h3> <p data-svelte-h="svelte-1mu6sph"><img src="https://huggingface.co/datasets/hf-vision/course-assets/resolve/main/hiera_images/hiera_changes.png" alt="Simplifying MViTv2"></p> <p data-svelte-h="svelte-q7z0qq">This table lists all the changes that authors made to MViTv2 in order to create Hiera and how each change affects accuracy and speed on images and video.</p> <ul data-svelte-h="svelte-1ka9l16"><li><strong>Replacing relative with absolute position embeddings</strong>: MViTv2 swaps the absolute position embeddings from the original <a href="https://arxiv.org/abs/2010.11929" rel="nofollow">ViT</a> paper with relative ones added to attention in each block. Authors undo this change because it added more complexity to the model and, as it can be seen in the table, these relative position embeddings are not necessary when training with MAE (both accuracy and speed improve with this change).</li> <li><strong>Removing convolutions</strong>: Since the key idea of the paper is that a model can learn spatial biases by pretraining with a strong visual pretext task, removing convolutions, which are vision specific modules and add potentially unnecessary overhead seems to be an important change. Authors first replace every conv with a max pooling layer which decreases accuracy at first because of the huge impact it has on the image features. However, they realize that they can remove some of these extra max pooling layers, specifically the ones with a stride of 1 since they are basically applying a ReLU on every feature map. By doing so, authors nearly returned to the accuracy they had before, while speeding up the model by 22% for images and 27% for video.</li></ul> <h3 class="relative group"><a id="masked-autoencoder" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#masked-autoencoder"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Masked Autoencoder</span></h3> <p data-svelte-h="svelte-k259cb">Masked Autoencoder (MAE) is an unsupervised training paradigm. As with any other autoencoder, it consists of encoding high dimensional data (images) into a lower dimension representation (embeddings) in such a way that this data can be decoded into the original high dimensional data again. However, the visual MAE technique consists of dropping a certain amount of patches (around 75%), encoding the rest of the patches, and then trying to predict the missing ones. This idea has been used extensively in recent years as a pre-training task for image encoders.</p> <p data-svelte-h="svelte-hlbfp7"><img src="https://huggingface.co/datasets/hf-vision/course-assets/resolve/main/hiera_images/mae.png" alt="MAE"></p> <h2 class="relative group"><a id="why-does-hiera-matter" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#why-does-hiera-matter"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Why Does Hiera Matter?</span></h2> <p data-svelte-h="svelte-18jmdfi">In the era dominated by transformer models, there are still a lot of attempts to improve this simple architecture, adding the complexity of CNN to convert them into hierarchical models again. Although hierarchical models excel in computer vision, this study demonstrates that achieving hierarchical transformers doesn’t necessitate intricate architectural modifications. Instead, a concentrated emphasis on the training task alone can yield simple, fast, and precise models.</p> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/computer-vision-course/blob/main/chapters/en/unit13/hiera.mdx" target="_blank"><span data-svelte-h="svelte-1kd6by1">&lt;</span> <span data-svelte-h="svelte-x0xyl0">&gt;</span> <span data-svelte-h="svelte-1dajgef"><span class="underline ml-1.5">Update</span> on GitHub</span></a> <p></p>
<script>
{
__sveltekit_1p6gie1 = {
assets: "/docs/computer-vision-course/pr_397/en",
base: "/docs/computer-vision-course/pr_397/en",
env: {}
};
const element = document.currentScript.parentElement;
const data = [null,null];
Promise.all([
import("/docs/computer-vision-course/pr_397/en/_app/immutable/entry/start.7f209408.js"),
import("/docs/computer-vision-course/pr_397/en/_app/immutable/entry/app.32e8338e.js")
]).then(([kit, app]) => {
kit.start(app, element, {
node_ids: [0, 28],
data,
form: null,
error: null
});
});
}
</script>

Xet Storage Details

Size:
17.8 kB
·
Xet hash:
0b32ec2bd33d544818537a3a40b16bd6278f2fb45092ce2e2e6c10a8e9ee6d02

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.