Buckets:

hf-doc-build/doc-dev / transformers /main /it /perf_infer_gpu_one.html
rtrm's picture
download
raw
26 kB
<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Inferenza efficiente su GPU singola&quot;,&quot;local&quot;:&quot;inferenza-efficiente-su-gpu-singola&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;BetterTransformer per l’inferenza più veloce&quot;,&quot;local&quot;:&quot;bettertransformer-per-linferenza-più-veloce&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Integrazione di bitsandbytes per Int8 mixed-precision matrix decomposition&quot;,&quot;local&quot;:&quot;integrazione-di-bitsandbytes-per-int8-mixed-precision-matrix-decomposition&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Requisiti&quot;,&quot;local&quot;:&quot;requisiti&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Esecuzione di modelli mixed-Int8 - configurazione per singola GPU&quot;,&quot;local&quot;:&quot;esecuzione-di-modelli-mixed-int8---configurazione-per-singola-gpu&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Esecuzione di modelli mixed-8bit - configurazione multi GPU&quot;,&quot;local&quot;:&quot;esecuzione-di-modelli-mixed-8bit---configurazione-multi-gpu&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Colab demos&quot;,&quot;local&quot;:&quot;colab-demos&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2}],&quot;depth&quot;:1}">
<link href="/docs/transformers/main/it/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload">
<link rel="modulepreload" href="/docs/transformers/main/it/_app/immutable/entry/start.dac10038.js">
<link rel="modulepreload" href="/docs/transformers/main/it/_app/immutable/chunks/scheduler.36a0863c.js">
<link rel="modulepreload" href="/docs/transformers/main/it/_app/immutable/chunks/singletons.d5d832bc.js">
<link rel="modulepreload" href="/docs/transformers/main/it/_app/immutable/chunks/index.733708bb.js">
<link rel="modulepreload" href="/docs/transformers/main/it/_app/immutable/chunks/paths.d03c7e87.js">
<link rel="modulepreload" href="/docs/transformers/main/it/_app/immutable/entry/app.354f2351.js">
<link rel="modulepreload" href="/docs/transformers/main/it/_app/immutable/chunks/index.9c13489a.js">
<link rel="modulepreload" href="/docs/transformers/main/it/_app/immutable/nodes/0.b7cdb3fe.js">
<link rel="modulepreload" href="/docs/transformers/main/it/_app/immutable/chunks/each.e59479a4.js">
<link rel="modulepreload" href="/docs/transformers/main/it/_app/immutable/nodes/20.db84ea44.js">
<link rel="modulepreload" href="/docs/transformers/main/it/_app/immutable/chunks/Tip.3b06990e.js">
<link rel="modulepreload" href="/docs/transformers/main/it/_app/immutable/chunks/CodeBlock.05d8ec32.js">
<link rel="modulepreload" href="/docs/transformers/main/it/_app/immutable/chunks/EditOnGithub.e88f2b7b.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Inferenza efficiente su GPU singola&quot;,&quot;local&quot;:&quot;inferenza-efficiente-su-gpu-singola&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;BetterTransformer per l’inferenza più veloce&quot;,&quot;local&quot;:&quot;bettertransformer-per-linferenza-più-veloce&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Integrazione di bitsandbytes per Int8 mixed-precision matrix decomposition&quot;,&quot;local&quot;:&quot;integrazione-di-bitsandbytes-per-int8-mixed-precision-matrix-decomposition&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Requisiti&quot;,&quot;local&quot;:&quot;requisiti&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Esecuzione di modelli mixed-Int8 - configurazione per singola GPU&quot;,&quot;local&quot;:&quot;esecuzione-di-modelli-mixed-int8---configurazione-per-singola-gpu&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Esecuzione di modelli mixed-8bit - configurazione multi GPU&quot;,&quot;local&quot;:&quot;esecuzione-di-modelli-mixed-8bit---configurazione-multi-gpu&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Colab demos&quot;,&quot;local&quot;:&quot;colab-demos&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2}],&quot;depth&quot;:1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <h1 class="relative group"><a id="inferenza-efficiente-su-gpu-singola" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#inferenza-efficiente-su-gpu-singola"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Inferenza efficiente su GPU singola</span></h1> <p data-svelte-h="svelte-nhozkm">Questo documento sarà presto completato con informazioni su come effetture l’inferenza su una singola GPU. Nel frattempo è possibile consultare <a href="perf_train_gpu_one">la guida per l’addestramento su una singola GPU</a> e <a href="perf_infer_cpu">la guida per l’inferenza su CPU</a>.</p> <h2 class="relative group"><a id="bettertransformer-per-linferenza-più-veloce" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#bettertransformer-per-linferenza-più-veloce"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>BetterTransformer per l’inferenza più veloce</span></h2> <p data-svelte-h="svelte-1z110ap">Abbiamo recentemente integrato <code>BetterTransformer</code> per velocizzare l’inferenza su GPU per modelli di testo, immagini e audio. Per maggiori dettagli, consultare la documentazione su questa integrazione <a href="https://huggingface.co/docs/optimum/bettertransformer/overview" rel="nofollow">qui</a>.</p> <h2 class="relative group"><a id="integrazione-di-bitsandbytes-per-int8-mixed-precision-matrix-decomposition" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#integrazione-di-bitsandbytes-per-int8-mixed-precision-matrix-decomposition"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Integrazione di bitsandbytes per Int8 mixed-precision matrix decomposition</span></h2> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-vzcdbv">Nota che questa funzione può essere utilizzata anche nelle configurazioni multi GPU.</p></div> <p data-svelte-h="svelte-om1tv8">Dal paper <a href="https://arxiv.org/abs/2208.07339" rel="nofollow"><code>LLM.int8() : 8-bit Matrix Multiplication for Transformers at Scale</code></a>, noi supportiamo l’integrazione di Hugging Face per tutti i modelli dell’Hub con poche righe di codice.
Il metodo <code>nn.Linear</code> riduce la dimensione di 2 per i pesi <code>float16</code> e <code>bfloat16</code> e di 4 per i pesi <code>float32</code>, con un impatto quasi nullo sulla qualità, operando sugli outlier in half-precision.</p> <p data-svelte-h="svelte-1tsrdqi"><img src="https://cdn-uploads.huggingface.co/production/uploads/1659861207959-62441d1d9fdefb55a0b7d12c.png" alt="HFxbitsandbytes.png"></p> <p data-svelte-h="svelte-1hgrk11">Il metodo Int8 mixed-precision matrix decomposition funziona separando la moltiplicazione tra matrici in due flussi: (1) una matrice di flusso di outlier di caratteristiche sistematiche moltiplicata in fp16, (2) in flusso regolare di moltiplicazione di matrici int8 (99,9%). Con questo metodo, è possibile effettutare inferenza int8 per modelli molto grandi senza degrado predittivo.
Per maggiori dettagli sul metodo, consultare il <a href="https://arxiv.org/abs/2208.07339" rel="nofollow">paper</a> o il nostro <a href="https://huggingface.co/blog/hf-bitsandbytes-integration" rel="nofollow">blogpost sull’integrazione</a>.</p> <p data-svelte-h="svelte-y2mgdg"><img src="https://cdn-uploads.huggingface.co/production/uploads/1660567469965-62441d1d9fdefb55a0b7d12c.gif" alt="MixedInt8.gif"></p> <p data-svelte-h="svelte-1b4hp8z">Nota che è necessaria una GPU per eseguire modelli di tipo mixed-8bit, poiché i kernel sono stati compilati solo per le GPU. Prima di utilizzare questa funzione, assicurarsi di disporre di memoria sufficiente sulla GPU per memorizzare un quarto del modello (o la metà se i pesi del modello sono in mezza precisione).
Di seguito sono riportate alcune note per aiutarvi a utilizzare questo modulo, oppure seguite le dimostrazioni su <a href="#colab-demos">Google colab</a>.</p> <h3 class="relative group"><a id="requisiti" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#requisiti"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Requisiti</span></h3> <ul data-svelte-h="svelte-50hzsq"><li>Se si dispone di <code>bitsandbytes&lt;0.37.0</code>, assicurarsi di eseguire su GPU NVIDIA che supportano tensor cores a 8 bit (Turing, Ampere o architetture più recenti - ad esempio T4, RTX20s RTX30s, A40-A100). Per <code>bitsandbytes&gt;=0.37.0</code>, tutte le GPU dovrebbero essere supportate.</li> <li>Installare la versione corretta di <code>bitsandbytes</code> eseguendo:
<code>pip install bitsandbytes&gt;=0.31.5</code>.</li> <li>Installare <code>accelerate</code> <code>pip install accelerate&gt;=0.12.0</code></li></ul> <h3 class="relative group"><a id="esecuzione-di-modelli-mixed-int8---configurazione-per-singola-gpu" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#esecuzione-di-modelli-mixed-int8---configurazione-per-singola-gpu"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Esecuzione di modelli mixed-Int8 - configurazione per singola GPU</span></h3> <p data-svelte-h="svelte-14t427w">Dopo aver installato le librerie necessarie, per caricare il tuo modello mixed 8-bit è il seguente:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoModelForCausalLM, BitsAndBytesConfig
model_name = <span class="hljs-string">&quot;bigscience/bloom-2b5&quot;</span>
model_8bit = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=BitsAndBytesConfig(load_in_8bit=<span class="hljs-literal">True</span>))<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-7gz38r">Per la generazione di testo, si consiglia di:</p> <ul data-svelte-h="svelte-drw5lu"><li>utilizzare il metodo <code>generate()</code> del modello invece della funzione <code>pipeline()</code>. Sebbene l’inferenza sia possibile con la funzione <code>pipeline()</code>, essa non è ottimizzata per i modelli mixed-8bit e sarà più lenta rispetto all’uso del metodo <code>generate()</code>. Inoltre, alcune strategie di campionamento, come il campionamento nucleaus, non sono supportate dalla funzione <code>pipeline()</code> per i modelli mixed-8bit.</li> <li>collocare tutti gli ingressi sullo stesso dispositivo del modello.</li></ul> <p data-svelte-h="svelte-1s7g1s7">Ecco un semplice esempio:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
model_name = <span class="hljs-string">&quot;bigscience/bloom-2b5&quot;</span>
tokenizer = AutoTokenizer.from_pretrained(model_name)
model_8bit = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=BitsAndBytesConfig(load_in_8bit=<span class="hljs-literal">True</span>))
text = <span class="hljs-string">&quot;Hello, my llama is cute&quot;</span>
inputs = tokenizer(prompt, return_tensors=<span class="hljs-string">&quot;pt&quot;</span>).to(<span class="hljs-string">&quot;cuda&quot;</span>)
generated_ids = model.generate(**inputs)
outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=<span class="hljs-literal">True</span>)<!-- HTML_TAG_END --></pre></div> <h3 class="relative group"><a id="esecuzione-di-modelli-mixed-8bit---configurazione-multi-gpu" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#esecuzione-di-modelli-mixed-8bit---configurazione-multi-gpu"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Esecuzione di modelli mixed-8bit - configurazione multi GPU</span></h3> <p data-svelte-h="svelte-df9yha">Usare il seguente modo caricare il modello mixed-8bit su più GPU (stesso comando della configurazione a GPU singola):</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->model_name = <span class="hljs-string">&quot;bigscience/bloom-2b5&quot;</span>
model_8bit = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=BitsAndBytesConfig(load_in_8bit=<span class="hljs-literal">True</span>))<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1qhnznq">Puoi controllare la RAM della GPU che si vuole allocare su ogni GPU usando <code>accelerate</code>. Utilizzare l’argomento <code>max_memory</code> come segue:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->max_memory_mapping = {<span class="hljs-number">0</span>: <span class="hljs-string">&quot;1GB&quot;</span>, <span class="hljs-number">1</span>: <span class="hljs-string">&quot;2GB&quot;</span>}
model_name = <span class="hljs-string">&quot;bigscience/bloom-3b&quot;</span>
model_8bit = AutoModelForCausalLM.from_pretrained(
model_name, device_map=<span class="hljs-string">&quot;auto&quot;</span>, load_in_8bit=<span class="hljs-literal">True</span>, max_memory=max_memory_mapping
)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-80anhf">In questo esempio, la prima GPU utilizzerà 1 GB di memoria e la seconda 2 GB.</p> <h3 class="relative group"><a id="colab-demos" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#colab-demos"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Colab demos</span></h3> <p data-svelte-h="svelte-e4lo0z">Con questo metodo è possibile inferire modelli che prima non era possibile inferire su Google Colab.
Guardate la demo per l’esecuzione di T5-11b (42GB in fp32)! Utilizzo la quantizzazione a 8 bit su Google Colab:</p> <p data-svelte-h="svelte-1yb5ek4"><a href="https://colab.research.google.com/drive/1YORPWx4okIHXnjW7MSAidXN29mPVNT7F?usp=sharing" rel="nofollow"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab: T5-11b demo"></a></p> <p data-svelte-h="svelte-1u8p947">Oppure questa demo di BLOOM-3B:</p> <p data-svelte-h="svelte-6z7881"><a href="https://colab.research.google.com/drive/1qOjXfQIAULfKvZqwCen8-MoWKGdSatZ4?usp=sharing" rel="nofollow"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab: BLOOM-3b demo"></a></p> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/transformers/blob/main/docs/source/it/perf_infer_gpu_one.md" target="_blank"><span data-svelte-h="svelte-1kd6by1">&lt;</span> <span data-svelte-h="svelte-x0xyl0">&gt;</span> <span data-svelte-h="svelte-1dajgef"><span class="underline ml-1.5">Update</span> on GitHub</span></a> <p></p>
<script>
{
__sveltekit_1h8xam7 = {
assets: "/docs/transformers/main/it",
base: "/docs/transformers/main/it",
env: {}
};
const element = document.currentScript.parentElement;
const data = [null,null];
Promise.all([
import("/docs/transformers/main/it/_app/immutable/entry/start.dac10038.js"),
import("/docs/transformers/main/it/_app/immutable/entry/app.354f2351.js")
]).then(([kit, app]) => {
kit.start(app, element, {
node_ids: [0, 20],
data,
form: null,
error: null
});
});
}
</script>

Xet Storage Details

Size:
26 kB
·
Xet hash:
52f3ac4fba8e7d373526be6750808a64425c585a76304ef5d387913022f8450a

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.