Buckets:

rtrm's picture
download
raw
27.7 kB
<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Handling big models for inference&quot;,&quot;local&quot;:&quot;handling-big-models-for-inference&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Using 🤗 Accelerate&quot;,&quot;local&quot;:&quot;using--accelerate&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Complete Example&quot;,&quot;local&quot;:&quot;complete-example&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Using 🤗 Transformers, 🤗 Diffusers, and other 🤗 Open Source Libraries&quot;,&quot;local&quot;:&quot;using--transformers--diffusers-and-other--open-source-libraries&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Where to go from here&quot;,&quot;local&quot;:&quot;where-to-go-from-here&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}">
<link href="/docs/accelerate/main/en/_app/immutable/assets/0.e3b0c442.css" rel="stylesheet">
<link rel="modulepreload" href="/docs/accelerate/main/en/_app/immutable/entry/start.2ea03080.js">
<link rel="modulepreload" href="/docs/accelerate/main/en/_app/immutable/chunks/scheduler.defa9a21.js">
<link rel="modulepreload" href="/docs/accelerate/main/en/_app/immutable/chunks/singletons.aff0b9fc.js">
<link rel="modulepreload" href="/docs/accelerate/main/en/_app/immutable/chunks/index.beade68d.js">
<link rel="modulepreload" href="/docs/accelerate/main/en/_app/immutable/chunks/paths.2c85d1a6.js">
<link rel="modulepreload" href="/docs/accelerate/main/en/_app/immutable/entry/app.e6812672.js">
<link rel="modulepreload" href="/docs/accelerate/main/en/_app/immutable/chunks/index.fe795e71.js">
<link rel="modulepreload" href="/docs/accelerate/main/en/_app/immutable/nodes/0.39c84d5d.js">
<link rel="modulepreload" href="/docs/accelerate/main/en/_app/immutable/chunks/each.e59479a4.js">
<link rel="modulepreload" href="/docs/accelerate/main/en/_app/immutable/nodes/35.f4c7a773.js">
<link rel="modulepreload" href="/docs/accelerate/main/en/_app/immutable/chunks/Tip.179eb360.js">
<link rel="modulepreload" href="/docs/accelerate/main/en/_app/immutable/chunks/Youtube.73056ec3.js">
<link rel="modulepreload" href="/docs/accelerate/main/en/_app/immutable/chunks/CodeBlock.42404125.js">
<link rel="modulepreload" href="/docs/accelerate/main/en/_app/immutable/chunks/EditOnGithub.0f575778.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Handling big models for inference&quot;,&quot;local&quot;:&quot;handling-big-models-for-inference&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Using 🤗 Accelerate&quot;,&quot;local&quot;:&quot;using--accelerate&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Complete Example&quot;,&quot;local&quot;:&quot;complete-example&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Using 🤗 Transformers, 🤗 Diffusers, and other 🤗 Open Source Libraries&quot;,&quot;local&quot;:&quot;using--transformers--diffusers-and-other--open-source-libraries&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Where to go from here&quot;,&quot;local&quot;:&quot;where-to-go-from-here&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <h1 class="relative group"><a id="handling-big-models-for-inference" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#handling-big-models-for-inference"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 
8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Handling big models for inference</span></h1> <p data-svelte-h="svelte-1y1xrx1">One of the biggest advancements 🤗 Accelerate provides is the concept of <a href="../concept_guides/big_model_inference">large model inference</a> wherein you can perform <em>inference</em> on models that cannot fully fit on your graphics card.</p> <p data-svelte-h="svelte-wbf5nd">This tutorial will be broken down into two parts showcasing how to use both 🤗 Accelerate and 🤗 Transformers (a higher API-level) to make use of this idea.</p> <h2 class="relative group"><a id="using--accelerate" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#using--accelerate"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Using 🤗 Accelerate</span></h2> <p data-svelte-h="svelte-vbpybg">For these tutorials, we’ll assume a typical workflow for loading your model in such that:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer 
focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> torch
my_model = ModelClass(...)
state_dict = torch.load(checkpoint_file)
my_model.load_state_dict(state_dict)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-jxa9oh">Note that here we assume that <code>ModelClass</code> is a model that takes up more video-card memory than what can fit on your device (be it <code>mps</code> or <code>cuda</code>).</p> <p data-svelte-h="svelte-1hs87yh">The first step is to init an empty skeleton of the model which won’t take up any RAM using the <a href="/docs/accelerate/main/en/package_reference/big_modeling#accelerate.init_empty_weights">init_empty_weights()</a> context manager:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> accelerate <span class="hljs-keyword">import</span> init_empty_weights
<span class="hljs-keyword">with</span> init_empty_weights():
    my_model = ModelClass(...)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1ghbhr2">With this, <code>my_model</code> is currently “parameterless”, leaving a smaller memory footprint than what one would normally get by loading it onto the CPU directly.</p> <p data-svelte-h="svelte-1uet975">Next we need to load in the weights to our model so we can perform inference.</p> <p data-svelte-h="svelte-14hc8p7">For this we will use <a href="/docs/accelerate/main/en/package_reference/big_modeling#accelerate.load_checkpoint_and_dispatch">load_checkpoint_and_dispatch()</a>, which as the name implies will load a checkpoint inside your empty model and dispatch the weights for each layer across all the devices you have available (GPU/MPS and CPU RAM).</p> <p data-svelte-h="svelte-fm5ztl">To determine how this <code>dispatch</code> can be performed, generally specifying <code>device_map=&quot;auto&quot;</code> will be good enough as 🤗 Accelerate
will attempt to fill all the space in your GPU(s), then loading them to the CPU, and finally if there is not enough RAM it will be loaded to the disk (the absolute slowest option).</p> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-mwnzh4">For more details on designing your own device map, see this section of the <a href="../concept_guides/big_model_inference#designing-a-device-map">concept guide</a></p></div> <p data-svelte-h="svelte-kl0aht">See an example below:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> accelerate <span class="hljs-keyword">import</span> 
load_checkpoint_and_dispatch
model = load_checkpoint_and_dispatch(
    my_model, checkpoint=checkpoint_file, device_map=<span class="hljs-string">&quot;auto&quot;</span>
)<!-- HTML_TAG_END --></pre></div> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-1v1st9m">If there are certain “chunks” of layers that shouldn’t be split, you can pass them in as <code>no_split_module_classes</code>. Read more about it <a href="../concept_guides/big_model_inference#loading-weights">here</a></p></div> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-1j3j2dw">Also to save on memory (such as if the <code>state_dict</code> will not fit in RAM), a model’s weights can be divided and split into multiple checkpoint files. Read more about it <a href="../concept_guides/big_model_inference#sharded-checkpoints">here</a></p></div> <p data-svelte-h="svelte-1r19r04">Now that the model is dispatched fully, you can perform inference as normal with the model:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none 
transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-built_in">input</span> = torch.randn(<span class="hljs-number">2</span>,<span class="hljs-number">3</span>)
<span class="hljs-built_in">input</span> = <span class="hljs-built_in">input</span>.to(<span class="hljs-string">&quot;cuda&quot;</span>)
output = model(<span class="hljs-built_in">input</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-rk6ubd">What will happen now is each time the input gets passed through a layer, it will be sent from the CPU to the GPU (or disk to CPU to GPU), the output is calculated, and then the layer is pulled back off the GPU going back down the line. While this adds some overhead to the inference being performed, through this method it is possible to run <strong>any size model</strong> on your system, as long as the largest layer is capable of fitting on your GPU.</p> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-1i63vy9">Multiple GPUs can be utilized, however this is considered “model parallelism” and as a result only one GPU will be active at a given moment, waiting for the prior one to send it the output. You should launch your script normally with <code>python</code>
and not need <code>torchrun</code>, <code>accelerate launch</code>, etc.</p></div> <p data-svelte-h="svelte-zm6t1c">For a visual representation of this, check out the animation below:</p> <iframe class="w-full xl:w-4/6 h-80" src="https://www.youtube-nocookie.com/embed/MWCSGj9jEAo" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe> <h3 class="relative group"><a id="complete-example" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#complete-example"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Complete Example</span></h3> <p data-svelte-h="svelte-js0stu">Below is the full example showcasing what we performed above:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" 
role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> torch
<span class="hljs-keyword">from</span> accelerate <span class="hljs-keyword">import</span> init_empty_weights, load_checkpoint_and_dispatch
<span class="hljs-keyword">with</span> init_empty_weights():
    model = MyModel(...)
model = load_checkpoint_and_dispatch(
    model, checkpoint=checkpoint_file, device_map=<span class="hljs-string">&quot;auto&quot;</span>
)
<span class="hljs-built_in">input</span> = torch.randn(<span class="hljs-number">2</span>,<span class="hljs-number">3</span>)
<span class="hljs-built_in">input</span> = <span class="hljs-built_in">input</span>.to(<span class="hljs-string">&quot;cuda&quot;</span>)
output = model(<span class="hljs-built_in">input</span>)<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="using--transformers--diffusers-and-other--open-source-libraries" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#using--transformers--diffusers-and-other--open-source-libraries"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Using 🤗 Transformers, 🤗 Diffusers, and other 🤗 Open Source Libraries</span></h2> <p data-svelte-h="svelte-136pb64">Libraries that support 🤗 Accelerate big model inference include all of the earlier logic in their <code>from_pretrained</code> constructors.</p> <p data-svelte-h="svelte-dfcuzq">These operate by specifying a string representing the model to download from the <a href="https://hf.co/models" rel="nofollow">🤗 Hub</a> and then denoting <code>device_map=&quot;auto&quot;</code> along with a few extra parameters.</p> <p data-svelte-h="svelte-1p0xr7h">As a brief example, we will look at using <code>transformers</code> and loading in Big Science’s T0pp model.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center 
relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoModelForSeq2SeqLM
model = AutoModelForSeq2SeqLM.from_pretrained(<span class="hljs-string">&quot;bigscience/T0pp&quot;</span>, device_map=<span class="hljs-string">&quot;auto&quot;</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-v76uzj">After loading the model in, the initial steps from before to prepare a model have all been done and the model is fully
ready to make use of all the resources in your machine. Through these constructors, you can also save <em>more</em> memory by
specifying the precision the model is loaded into as well, through the <code>torch_dtype</code> parameter, such as:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoModelForSeq2SeqLM
model = AutoModelForSeq2SeqLM.from_pretrained(<span class="hljs-string">&quot;bigscience/T0pp&quot;</span>, device_map=<span class="hljs-string">&quot;auto&quot;</span>, torch_dtype=torch.float16)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-25xm0x">To learn more about this, check out the 🤗 Transformers documentation available <a href="https://huggingface.co/docs/transformers/main/en/main_classes/model#large-model-loading" rel="nofollow">here</a>.</p> <h2 class="relative group"><a id="where-to-go-from-here" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#where-to-go-from-here"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Where to go from here</span></h2> <p data-svelte-h="svelte-5fb45b">For a much more detailed look at big model inference, be sure to check out the <a href="../concept_guides/big_model_inference">Conceptual Guide on it</a></p> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/accelerate/blob/main/docs/source/usage_guides/big_modeling.md" target="_blank"><span data-svelte-h="svelte-1kd6by1">&lt;</span> <span 
data-svelte-h="svelte-x0xyl0">&gt;</span> <span data-svelte-h="svelte-1dajgef"><span class="underline ml-1.5">Update</span> on GitHub</span></a> <p></p>
<script>
{
  // Both the asset root and the base path point at the same docs prefix.
  const docsRoot = "/docs/accelerate/main/en";

  // Deliberately undeclared: in this classic <script> the assignment creates a
  // global. NOTE(review): presumably the loaded SvelteKit modules read it by
  // this exact name — confirm before renaming or scoping it.
  __sveltekit_1fyccrg = { assets: docsRoot, base: docsRoot, env: {} };

  // Read currentScript synchronously: it is only set while this script runs,
  // not later inside the asynchronous continuation below.
  const mountPoint = document.currentScript.parentElement;

  // Options handed to kit.start(); node_ids [0, 35] look like they match the
  // preloaded nodes/0 and nodes/35 chunks — TODO confirm.
  const startOptions = {
    node_ids: [0, 35],
    data: [null, null],
    form: null,
    error: null
  };

  // Fetch both client entry points in parallel, then start the app on the
  // element that contains this script.
  (async () => {
    const [kit, app] = await Promise.all([
      import("/docs/accelerate/main/en/_app/immutable/entry/start.2ea03080.js"),
      import("/docs/accelerate/main/en/_app/immutable/entry/app.e6812672.js")
    ]);
    kit.start(app, mountPoint, startOptions);
  })();
}
</script>

Xet Storage Details

Size:
27.7 kB
·
Xet hash:
970e19c394f0fdd1c7f5ab7ceed783c55b0662ce73bd22d3c64eff4978cbfac4

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.