Buckets:

hf-doc-build
/

doc-dev

Files

xet

hf-doc-build/doc-dev / computer-vision-course /pr_397 /en /unit13 /retention.html

rtrm

about 1 month ago

download

raw

17.3 kB

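	<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Retention In Vision&quot;,&quot;local&quot;:&quot;retention-in-vision&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;What are Retention Networks&quot;,&quot;local&quot;:&quot;what-are-retention-networks&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;From Language to Image&quot;,&quot;local&quot;:&quot;from-language-to-image&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;RMT&quot;,&quot;local&quot;:&quot;rmt&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;ViR&quot;,&quot;local&quot;:&quot;vir&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Further Reading&quot;,&quot;local&quot;:&quot;further-reading&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}">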
	<link href="/docs/computer-vision-course/pr_397/en/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload">
	<link rel="modulepreload" href="/docs/computer-vision-course/pr_397/en/_app/immutable/entry/start.7f209408.js">
	<link rel="modulepreload" href="/docs/computer-vision-course/pr_397/en/_app/immutable/chunks/scheduler.7bc62968.js">
	<link rel="modulepreload" href="/docs/computer-vision-course/pr_397/en/_app/immutable/chunks/singletons.b15acae1.js">
	<link rel="modulepreload" href="/docs/computer-vision-course/pr_397/en/_app/immutable/chunks/paths.11cdc4b4.js">
	<link rel="modulepreload" href="/docs/computer-vision-course/pr_397/en/_app/immutable/entry/app.32e8338e.js">
	<link rel="modulepreload" href="/docs/computer-vision-course/pr_397/en/_app/immutable/chunks/index.2f8492b0.js">
	<link rel="modulepreload" href="/docs/computer-vision-course/pr_397/en/_app/immutable/nodes/0.e37092e8.js">
	<link rel="modulepreload" href="/docs/computer-vision-course/pr_397/en/_app/immutable/nodes/31.4cd39928.js">
	<link rel="modulepreload" href="/docs/computer-vision-course/pr_397/en/_app/immutable/chunks/index.514d62da.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Retention In Vision&quot;,&quot;local&quot;:&quot;retention-in-vision&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;What are Retention Networks&quot;,&quot;local&quot;:&quot;what-are-retention-networks&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;From Language to Image&quot;,&quot;local&quot;:&quot;from-language-to-image&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;RMT&quot;,&quot;local&quot;:&quot;rmt&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;ViR&quot;,&quot;local&quot;:&quot;vir&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Further Reading&quot;,&quot;local&quot;:&quot;further-reading&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <h1 class="relative group"><a id="retention-in-vision" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#retention-in-vision"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Retention In Vision</span></h1> <h2 class="relative group"><a id="what-are-retention-networks" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" 
href="#what-are-retention-networks"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>What are Retention Networks</span></h2> <p data-svelte-h="svelte-1c1c5ur">Retentive Network (RetNet) is a foundational architecture proposed for large language models in the paper <a href="https://arxiv.org/abs/2307.08621" rel="nofollow">Retentive Network: A Successor to Transformer for Large Language Models</a>. This architecture is designed to address key challenges in the realm of large-scale language modeling: training parallelism, low-cost inference, and good performance.</p> <p data-svelte-h="svelte-1jgrh66"><img src="https://huggingface.co/datasets/hf-vision/course-assets/resolve/main/LLM%20Challenges.png" alt="LLM Challenges">
	RetNet is able to tackle these challenges by introducing the Multi-Scale Retention (MSR) mechanism, which is an alternative to the multi-head attention mechanism commonly used in Transformer models.
	MSR has a dual form of recurrence and parallelism, so it is possible to train the models in a parallel way while recurrently conducting inference. We will explore RetNet in detail in a later chapter.</p> <p data-svelte-h="svelte-buos82">The Multi-Scale Retention mechanism operates under three computation paradigms:</p> <ul data-svelte-h="svelte-y1lwo5"><li><p><strong>Parallel Representation:</strong> This aspect of RetNet is designed similarly to self-attention in the Transformer, and it enables us to train the models efficiently with GPUs.</p></li> <li><p><strong>Recurrent Representation:</strong> This representation facilitates efficient inference with O(1) complexity in terms of memory and computational requirements. It significantly reduces deployment costs and latency, and simplifies implementation by eliminating the need for key-value cache strategies often used in traditional models.</p></li> <li><p><strong>Chunkwise Recurrent Representation:</strong> This third paradigm addresses the challenge of long-sequence modeling. It achieves this by encoding each local block in parallel for computational speed while simultaneously and recurrently encoding global blocks to optimize GPU memory usage.</p></li></ul> <p data-svelte-h="svelte-f19fw6">During the training phase, the approach incorporates both parallel and chunkwise recurrent representations, optimizing GPU usage for fast computation and being particularly effective for long sequences in terms of computational efficiency and memory use.
	For the inference phase, the recurrent representation is used, favoring autoregressive decoding. This method efficiently reduces memory usage and latency, maintaining equivalent performance outcomes.</p> <h2 class="relative group"><a id="from-language-to-image" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#from-language-to-image"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>From Language to Image</span></h2> <h3 class="relative group"><a id="rmt" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#rmt"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 
0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>RMT</span></h3> <p data-svelte-h="svelte-1fau2a5">The paper <a href="https://arxiv.org/abs/2309.11523" rel="nofollow">RMT: Retentive Networks Meet Vision Transformers</a> proposes a new vision backbone inspired by the RetNet architecture. The authors propose RMT to enhance the Vision Transformer (ViT) by introducing explicit spatial priors and reducing computational complexity, drawing inspiration from the RetNet’s parallel representation.
	This includes adapting the RetNet’s temporal decay to spatial domains and using a <a href="https://en.wikipedia.org/wiki/Taxicab_geometry" rel="nofollow">Manhattan distance-based</a> spatial decay matrix, along with an attention decomposition form, to improve efficiency and scalability in vision tasks.</p> <ul data-svelte-h="svelte-1w70jkh"><li><p><strong>Manhattan Self-Attention (MaSA)</strong> <img src="https://huggingface.co/datasets/hf-vision/course-assets/resolve/main/Attention%20Comparison.png" alt="Attention Comparison">
	MaSA incorporates the Self-Attention mechanism with a two-dimensional bidirectional spatial decay matrix based on the Manhattan distance between tokens. This matrix decreases attention scores for tokens further away from a target token, allowing it to perceive global information while varying attention based on distance.</p></li> <li><p><strong>Decomposed Manhattan Self-Attention (MaSAD)</strong> <img src="https://huggingface.co/datasets/hf-vision/course-assets/resolve/main/MaSAD.png" alt="MaSAD">
	This mechanism decomposes Self-Attention along the horizontal and vertical axes of the image, maintaining the spatial decay matrix without losing prior information. This decomposition allows the Manhattan Self-Attention (MaSA) to model global information efficiently with linear complexity, while preserving the original MaSA’s receptive field shape.</p></li></ul> <p data-svelte-h="svelte-17h5zjt">However, unlike the original RetNet, which performs training with the parallel representation and inference with the recurrent representation, RMT uses the MaSA mechanism for both. The authors compare MaSA with RetNet’s other representations and show that MaSA achieves the best throughput together with the highest accuracy.
	<img src="https://huggingface.co/datasets/hf-vision/course-assets/resolve/main/MaSA%20vs%20Retention.png" alt="MaSA vs Retention"></p> <h3 class="relative group"><a id="vir" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#vir"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>ViR</span></h3> <p data-svelte-h="svelte-1guwj4s"><img src="https://huggingface.co/datasets/hf-vision/course-assets/resolve/main/ViR.png" alt="ViR"></p> <p data-svelte-h="svelte-10x61oz">Another work inspired by the RetNet architecture is the ViR, as discussed in the paper <a href="http://arxiv.org/abs/2310.19731" rel="nofollow">ViR: Vision Retention Networks</a>. In this architecture, the authors propose a general vision backbone with a redesigned retention mechanism. 
They demonstrate that ViR can scale favorably to larger image resolutions in terms of image throughput and memory consumption by leveraging the dual parallel and recurrent properties of the retentive network.</p> <p data-svelte-h="svelte-1cm9cc0">The overall architecture of ViR is quite similar to that of ViT, except that it replaces the Multi-Head Attention (MHA) with Multi-Head Retention (MHR). This MHR mechanism is free of any gating function and can be switched between parallel, recurrent, or chunkwise (a hybrid between parallel and recurrent) modes. Another difference in ViR is that the positional embedding is first added to the patch embedding, and then the [class] token is appended.</p> <h2 class="relative group"><a id="further-reading" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#further-reading"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Further Reading</span></h2> <ul data-svelte-h="svelte-m1gqrb"><li><a href="https://github.com/microsoft/torchscale/blob/main/torchscale/architecture/retnet.py" rel="nofollow">RetNet’s official repo</a></li> <li><a 
href="https://github.com/microsoft/torchscale/blob/main/torchscale/component/multiscale_retention.py" rel="nofollow">RetNet’s Multi-Scale Retention official repo</a></li> <li><a href="https://medium.com/ai-fusion-labs/retentive-networks-retnet-explained-the-much-awaited-transformers-killer-is-here-6c17e3e8add8" rel="nofollow">Retentive Networks (RetNet) Explained: The much-awaited Transformers-killer is here</a></li> <li><a href="https://www.youtube.com/watch?v=ec56a8wmfRk" rel="nofollow">Retentive Network: A Successor to Transformer for Large Language Models (Paper Explained)</a></li> <li><a href="https://github.com/qhfan/RMT" rel="nofollow">RMT’s official repo</a></li> <li><a href="https://github.com/NVlabs/ViR" rel="nofollow">ViR’s official repo</a></li></ul> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/computer-vision-course/blob/main/chapters/en/unit13/retention.mdx" target="_blank"><span data-svelte-h="svelte-1kd6by1">&lt;</span> <span data-svelte-h="svelte-x0xyl0">&gt;</span> <span data-svelte-h="svelte-1dajgef"><span class="underline ml-1.5">Update</span> on GitHub</span></a> <p></p>

	<script>
	{
	// Inline bootstrap for this page: publish the path settings on a global,
	// then load the two entry modules and hand control to the first one.

	// Assigned without a declaration, so it lands on the global object.
	// NOTE(review): presumably the imported entry modules look this object up
	// under this exact name — confirm before renaming.
	__sveltekit_1p6gie1 = {
		assets: "/docs/computer-vision-course/pr_397/en",
		base: "/docs/computer-vision-course/pr_397/en",
		env: {}
	};

	// Resolved up front, before the asynchronous imports settle: the parent
	// element of this <script> tag is what gets passed to start().
	const mountTarget = document.currentScript.parentElement;

	// One entry per node id below; both are null here.
	const hydrationData = [null, null];

	const startOptions = {
		node_ids: [0, 31],
		data: hydrationData,
		form: null,
		error: null
	};

	const entryModules = [
		import("/docs/computer-vision-course/pr_397/en/_app/immutable/entry/start.7f209408.js"),
		import("/docs/computer-vision-course/pr_397/en/_app/immutable/entry/app.32e8338e.js")
	];

	// Wait for both modules, then start with (app module, mount element, options).
	Promise.all(entryModules).then(function (loaded) {
		const kit = loaded[0];
		const app = loaded[1];
		kit.start(app, mountTarget, startOptions);
	});
	}
	</script>

Xet Storage Details

Size: 17.3 kB
Xet hash: 4d0dc47526c9b0f6500efcba219f0fb079e2b2689f443fc7d3b25eecbe5ce071

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.