Buckets:
| <meta charset="utf-8" /><meta name="hf:doc:metadata" content="{"title":"LoRA","local":"lora","sections":[{"title":"Initialization","local":"initialization","sections":[{"title":"PiSSA","local":"pissa","sections":[],"depth":3},{"title":"CorDA","local":"corda","sections":[],"depth":3},{"title":"OLoRA","local":"olora","sections":[],"depth":3},{"title":"EVA","local":"eva","sections":[],"depth":3},{"title":"LoftQ","local":"loftq","sections":[{"title":"Standard approach","local":"standard-approach","sections":[],"depth":4}],"depth":3},{"title":"Rank-stabilized LoRA","local":"rank-stabilized-lora","sections":[],"depth":3},{"title":"LoRA-GA","local":"lora-ga","sections":[],"depth":3}],"depth":2},{"title":"Variants","local":"variants","sections":[{"title":"Weight-Decomposed Low-Rank Adaptation (DoRA)","local":"weight-decomposed-low-rank-adaptation-dora","sections":[{"title":"Optimization","local":"optimization","sections":[],"depth":4},{"title":"Caveats","local":"caveats","sections":[],"depth":4}],"depth":3}],"depth":2},{"title":"Training","local":"training","sections":[{"title":"QLoRA-style training","local":"qlora-style-training","sections":[],"depth":3},{"title":"Memory efficient Layer Replication with LoRA","local":"memory-efficient-layer-replication-with-lora","sections":[],"depth":3},{"title":"Fine grained control over ranks and alpha (scaling)","local":"fine-grained-control-over-ranks-and-alpha-scaling","sections":[],"depth":3},{"title":"Targeting nn.Parameter directly","local":"targeting-nnparameter-directly","sections":[{"title":"Caveats","local":"caveats","sections":[],"depth":4},{"title":"MoE expert parameters and vLLM","local":"moe-expert-parameters-and-vllm","sections":[],"depth":4}],"depth":3},{"title":"Efficiently train tokens alongside LoRA","local":"efficiently-train-tokens-alongside-lora","sections":[],"depth":3},{"title":"Weight 
tying","local":"weight-tying","sections":[{"title":"modules_to_save","local":"modulestosave","sections":[],"depth":4},{"title":"target_modules","local":"targetmodules","sections":[],"depth":4},{"title":"trainable_token_indices","local":"trainabletokenindices","sections":[],"depth":4}],"depth":3}],"depth":2},{"title":"Optimizers","local":"optimizers","sections":[{"title":"LoRA-FA Optimizer","local":"lora-fa-optimizer","sections":[],"depth":3},{"title":"LoRA+ optimized LoRA","local":"lora-optimized-lora","sections":[],"depth":3}],"depth":2},{"title":"Post-Training","local":"post-training","sections":[{"title":"Merge LoRA weights into the base model","local":"merge-lora-weights-into-the-base-model","sections":[],"depth":3},{"title":"Recovering base model performance via intruder dimension reduction","local":"recovering-base-model-performance-via-intruder-dimension-reduction","sections":[],"depth":3}],"depth":2},{"title":"Load adapters","local":"load-adapters","sections":[],"depth":2},{"title":"Tensor Parallelism","local":"tensor-parallelism","sections":[],"depth":2},{"title":"Inference","local":"inference","sections":[{"title":"Activated LoRA (aLoRA)","local":"activated-lora-alora","sections":[{"title":"Choice of invocation sequence and SFT design","local":"choice-of-invocation-sequence-and-sft-design","sections":[],"depth":4},{"title":"Using (and reusing) cache for generation","local":"using-and-reusing-cache-for-generation","sections":[],"depth":4}],"depth":3},{"title":"Inference with different LoRA adapters in the same batch","local":"inference-with-different-lora-adapters-in-the-same-batch","sections":[{"title":"Caveats","local":"caveats","sections":[],"depth":4}],"depth":3},{"title":"Composing and Reusing LoRA Adapters","local":"composing-and-reusing-lora-adapters","sections":[{"title":"Arrow","local":"arrow","sections":[],"depth":4},{"title":"GenKnowSub","local":"genknowsub","sections":[],"depth":4}],"depth":3}],"depth":2}],"depth":1}"> | |
| <link href="/docs/peft/pr_3207/en/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload"> | |
| <link rel="modulepreload" href="/docs/peft/pr_3207/en/_app/immutable/entry/start.dbacf4b8.js"> | |
| <link rel="modulepreload" href="/docs/peft/pr_3207/en/_app/immutable/chunks/scheduler.78382b47.js"> | |
| <link rel="modulepreload" href="/docs/peft/pr_3207/en/_app/immutable/chunks/singletons.2397427a.js"> | |
| <link rel="modulepreload" href="/docs/peft/pr_3207/en/_app/immutable/chunks/index.fadd215c.js"> | |
| <link rel="modulepreload" href="/docs/peft/pr_3207/en/_app/immutable/chunks/paths.6fea2ad6.js"> | |
| <link rel="modulepreload" href="/docs/peft/pr_3207/en/_app/immutable/entry/app.cb03e0d4.js"> | |
| <link rel="modulepreload" href="/docs/peft/pr_3207/en/_app/immutable/chunks/preload-helper.2f407600.js"> | |
| <link rel="modulepreload" href="/docs/peft/pr_3207/en/_app/immutable/chunks/index.6dd35eb6.js"> | |
| <link rel="modulepreload" href="/docs/peft/pr_3207/en/_app/immutable/nodes/0.18735116.js"> | |
| <link rel="modulepreload" href="/docs/peft/pr_3207/en/_app/immutable/chunks/each.e59479a4.js"> | |
| <link rel="modulepreload" href="/docs/peft/pr_3207/en/_app/immutable/nodes/11.b8b581b5.js"> | |
| <link rel="modulepreload" href="/docs/peft/pr_3207/en/_app/immutable/chunks/MermaidChart.svelte_svelte_type_style_lang.d25d6883.js"> | |
| <link rel="modulepreload" href="/docs/peft/pr_3207/en/_app/immutable/chunks/CodeBlock.147ab5db.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{"title":"LoRA","local":"lora","sections":[{"title":"Initialization","local":"initialization","sections":[{"title":"PiSSA","local":"pissa","sections":[],"depth":3},{"title":"CorDA","local":"corda","sections":[],"depth":3},{"title":"OLoRA","local":"olora","sections":[],"depth":3},{"title":"EVA","local":"eva","sections":[],"depth":3},{"title":"LoftQ","local":"loftq","sections":[{"title":"Standard approach","local":"standard-approach","sections":[],"depth":4}],"depth":3},{"title":"Rank-stabilized LoRA","local":"rank-stabilized-lora","sections":[],"depth":3},{"title":"LoRA-GA","local":"lora-ga","sections":[],"depth":3}],"depth":2},{"title":"Variants","local":"variants","sections":[{"title":"Weight-Decomposed Low-Rank Adaptation (DoRA)","local":"weight-decomposed-low-rank-adaptation-dora","sections":[{"title":"Optimization","local":"optimization","sections":[],"depth":4},{"title":"Caveats","local":"caveats","sections":[],"depth":4}],"depth":3}],"depth":2},{"title":"Training","local":"training","sections":[{"title":"QLoRA-style training","local":"qlora-style-training","sections":[],"depth":3},{"title":"Memory efficient Layer Replication with LoRA","local":"memory-efficient-layer-replication-with-lora","sections":[],"depth":3},{"title":"Fine grained control over ranks and alpha (scaling)","local":"fine-grained-control-over-ranks-and-alpha-scaling","sections":[],"depth":3},{"title":"Targeting nn.Parameter directly","local":"targeting-nnparameter-directly","sections":[{"title":"Caveats","local":"caveats","sections":[],"depth":4},{"title":"MoE expert parameters and vLLM","local":"moe-expert-parameters-and-vllm","sections":[],"depth":4}],"depth":3},{"title":"Efficiently train tokens alongside LoRA","local":"efficiently-train-tokens-alongside-lora","sections":[],"depth":3},{"title":"Weight 
tying","local":"weight-tying","sections":[{"title":"modules_to_save","local":"modulestosave","sections":[],"depth":4},{"title":"target_modules","local":"targetmodules","sections":[],"depth":4},{"title":"trainable_token_indices","local":"trainabletokenindices","sections":[],"depth":4}],"depth":3}],"depth":2},{"title":"Optimizers","local":"optimizers","sections":[{"title":"LoRA-FA Optimizer","local":"lora-fa-optimizer","sections":[],"depth":3},{"title":"LoRA+ optimized LoRA","local":"lora-optimized-lora","sections":[],"depth":3}],"depth":2},{"title":"Post-Training","local":"post-training","sections":[{"title":"Merge LoRA weights into the base model","local":"merge-lora-weights-into-the-base-model","sections":[],"depth":3},{"title":"Recovering base model performance via intruder dimension reduction","local":"recovering-base-model-performance-via-intruder-dimension-reduction","sections":[],"depth":3}],"depth":2},{"title":"Load adapters","local":"load-adapters","sections":[],"depth":2},{"title":"Tensor Parallelism","local":"tensor-parallelism","sections":[],"depth":2},{"title":"Inference","local":"inference","sections":[{"title":"Activated LoRA (aLoRA)","local":"activated-lora-alora","sections":[{"title":"Choice of invocation sequence and SFT design","local":"choice-of-invocation-sequence-and-sft-design","sections":[],"depth":4},{"title":"Using (and reusing) cache for generation","local":"using-and-reusing-cache-for-generation","sections":[],"depth":4}],"depth":3},{"title":"Inference with different LoRA adapters in the same batch","local":"inference-with-different-lora-adapters-in-the-same-batch","sections":[{"title":"Caveats","local":"caveats","sections":[],"depth":4}],"depth":3},{"title":"Composing and Reusing LoRA Adapters","local":"composing-and-reusing-lora-adapters","sections":[{"title":"Arrow","local":"arrow","sections":[],"depth":4},{"title":"GenKnowSub","local":"genknowsub","sections":[],"depth":4}],"depth":3}],"depth":2}],"depth":1}"><!-- 
HEAD_svelte-u9bgzb_END --> <p></p> <div class="items-center shrink-0 min-w-[100px] max-sm:min-w-[50px] justify-end ml-auto flex" style="float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"><div class="inline-flex rounded-md max-sm:rounded-sm"><button class="inline-flex items-center gap-1 h-7 max-sm:h-7 px-2 max-sm:px-1.5 text-sm font-medium text-gray-800 border border-r-0 rounded-l-md max-sm:rounded-l-sm border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-live="polite"><span class="inline-flex items-center justify-center rounded-md p-0.5 max-sm:p-0 hover:text-gray-800 dark:hover:text-gray-200"><svg class="sm:size-3.5 size-3" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg></span> <span>Copy page</span></button> <button class="inline-flex items-center justify-center w-6 max-sm:w-5 h-7 max-sm:h-7 disabled:pointer-events-none text-sm text-gray-500 hover:text-gray-700 dark:hover:text-white rounded-r-md max-sm:rounded-r-sm border border-l transition border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-haspopup="menu" aria-expanded="false" aria-label="Open copy menu"><svg class="transition-transform text-gray-400 overflow-visible sm:size-3.5 size-3 rotate-0" width="1em" height="1em" viewBox="0 0 12 7" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M1 1L6 6L11 1" stroke="currentColor"></path></svg></button></div> </div> <h1 class="relative group"><a id="lora" class="header-link block 
pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#lora"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>LoRA</span></h1> <p data-svelte-h="svelte-1677fky">LoRA is a low-rank decomposition method to reduce the number of trainable parameters which speeds up finetuning large models and uses less memory. 
In PEFT, using LoRA is as easy as setting up a <a href="/docs/peft/pr_3207/en/package_reference/lora#peft.LoraConfig">LoraConfig</a> and wrapping it with <a href="/docs/peft/pr_3207/en/package_reference/peft_model#peft.get_peft_model">get_peft_model()</a> to create a trainable <a href="/docs/peft/pr_3207/en/package_reference/peft_model#peft.PeftModel">PeftModel</a>.</p> <p data-svelte-h="svelte-ovsn32">This guide explores in more detail other options and features for using LoRA.</p> <h2 class="relative group"><a id="initialization" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#initialization"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Initialization</span></h2> <p data-svelte-h="svelte-818g51">The initialization of LoRA weights is controlled by the parameter <code>init_lora_weights</code> in <a href="/docs/peft/pr_3207/en/package_reference/lora#peft.LoraConfig">LoraConfig</a>. 
By default, PEFT initializes LoRA weights with Kaiming-uniform for weight A and zeros for weight B resulting in an identity transform (same as the reference <a href="https://github.com/microsoft/LoRA" rel="nofollow">implementation</a>).</p> <p data-svelte-h="svelte-1fujb7a">It is also possible to pass <code>init_lora_weights="gaussian"</code>. As the name suggests, this initializes weight A with a Gaussian distribution and zeros for weight B (this is how <a href="https://huggingface.co/docs/diffusers/index" rel="nofollow">Diffusers</a> initializes LoRA weights).</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> peft <span class="hljs-keyword">import</span> LoraConfig | |
| config = LoraConfig(init_lora_weights=<span class="hljs-string">"gaussian"</span>, ...)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-mwvui1">There is also an option to set <code>init_lora_weights=False</code> which is useful for debugging and testing. This should be the only time you use this option. When choosing this option, the LoRA weights are initialized such that they do <em>not</em> result in an identity transform.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> peft <span class="hljs-keyword">import</span> LoraConfig | |
| config = LoraConfig(init_lora_weights=<span class="hljs-literal">False</span>, ...)<!-- HTML_TAG_END --></pre></div> <h3 class="relative group"><a id="pissa" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#pissa"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>PiSSA</span></h3> <p data-svelte-h="svelte-1dv6v3"><a href="https://huggingface.co/papers/2404.02948" rel="nofollow">PiSSA</a> initializes the LoRA adapter using the principal singular values and singular vectors. This straightforward modification allows PiSSA to converge more rapidly than LoRA and ultimately attain superior performance. 
Moreover, PiSSA reduces the quantization error compared to QLoRA, leading to further enhancements.</p> <p data-svelte-h="svelte-1hy0lgx">Configure the initialization method to “pissa”, which may take several minutes to execute SVD on the pre-trained model:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> peft <span class="hljs-keyword">import</span> LoraConfig | |
| config = LoraConfig(init_lora_weights=<span class="hljs-string">"pissa"</span>, ...)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-auzrbb">Alternatively, execute fast SVD, which takes only a few seconds. The number of iterations determines the trade-off between the error and computation time:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->lora_config = LoraConfig(init_lora_weights=<span class="hljs-string">"pissa_niter_[number of iters]"</span>, ...)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-16k39wq">For detailed instruction on using PiSSA, please follow <a href="https://github.com/huggingface/peft/tree/main/examples/pissa_finetuning" rel="nofollow">these instructions</a>.</p> <h3 class="relative group"><a id="corda" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute 
with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#corda"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>CorDA</span></h3> <p data-svelte-h="svelte-53sj5j"><a href="https://huggingface.co/papers/2406.05223" rel="nofollow">CorDA</a> builds task-aware LoRA adapters from weight decomposition oriented by the context of downstream task to learn (instruction-previewed mode, IPM) or world knowledge to maintain (knowledge-preserved mode, KPM). | |
| The KPM not only achieves better performance than LoRA on fine-tuning tasks, but also mitigates the catastrophic forgetting of pre-trained world knowledge. | |
| When preserving pre-trained knowledge is not a concern, | |
| the IPM is favored because it can further accelerate convergence and enhance the fine-tuning performance.</p> <p data-svelte-h="svelte-1s3c5rd">You need to configure the initialization method to “corda”, and specify the mode of IPM or KPM and the dataset to collect covariance matrices.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">@torch.no_grad()</span> | |
| <span class="hljs-keyword">def</span> <span class="hljs-title function_">run_model</span>(): | |
| <span class="hljs-comment"># Assume `model` and `dataset` are in context...</span> | |
| model.<span class="hljs-built_in">eval</span>() | |
| <span class="hljs-keyword">for</span> batch <span class="hljs-keyword">in</span> dataset: | |
| model(**batch) | |
| corda_config = CordaConfig( | |
| corda_method=<span class="hljs-string">"kpm"</span>, | |
| ) | |
| lora_config = LoraConfig( | |
| init_lora_weights=<span class="hljs-string">"corda"</span>, | |
| corda_config=corda_config, | |
| ) | |
| preprocess_corda(model, lora_config, run_model=run_model) | |
| peft_model = get_peft_model(model, lora_config)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-19qemwc">For detailed instruction on using CorDA, please follow <a href="https://github.com/huggingface/peft/tree/main/examples/corda_finetuning" rel="nofollow">these instructions</a>.</p> <h3 class="relative group"><a id="olora" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#olora"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>OLoRA</span></h3> <p data-svelte-h="svelte-1iny7jf"><a href="https://huggingface.co/papers/2406.01775" rel="nofollow">OLoRA</a> utilizes QR decomposition to initialize the LoRA adapters. OLoRA translates the base weights of the model by a factor of their QR decompositions, i.e., it mutates the weights before performing any training on them. 
This approach significantly improves stability, accelerates convergence speed, and ultimately achieves superior performance.</p> <p data-svelte-h="svelte-1j9g8qn">You just need to pass a single additional option to use OLoRA:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> peft <span class="hljs-keyword">import</span> LoraConfig | |
| config = LoraConfig(init_lora_weights=<span class="hljs-string">"olora"</span>, ...)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-sulxp4">For more advanced usage, please refer to our <a href="https://github.com/huggingface/peft/tree/main/examples/olora_finetuning" rel="nofollow">documentation</a>.</p> <h3 class="relative group"><a id="eva" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#eva"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>EVA</span></h3> <p data-svelte-h="svelte-zsbb0y"><a href="https://huggingface.co/papers/2410.07170" rel="nofollow">EVA</a> performs SVD on the input activations of each layer and uses the right-singular vectors to initialize LoRA weights. It is therefore a data-driven initialization scheme. 
Furthermore EVA adaptively allocates ranks across layers based on their “explained variance ratio” - a metric derived from the SVD analysis.</p> <p data-svelte-h="svelte-oqv0us">You can use EVA by setting <code>init_lora_weights="eva"</code> and defining <a href="/docs/peft/pr_3207/en/package_reference/lora#peft.EvaConfig">EvaConfig</a> in <a href="/docs/peft/pr_3207/en/package_reference/lora#peft.LoraConfig">LoraConfig</a>:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> peft <span class="hljs-keyword">import</span> LoraConfig, EvaConfig | |
| peft_config = LoraConfig( | |
| init_lora_weights = <span class="hljs-string">"eva"</span>, | |
| eva_config = EvaConfig(rho = <span class="hljs-number">2.0</span>), | |
| ... | |
| )<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-168zxvb">The parameter <code>rho</code> (≥ 1.0) determines how much redistribution is allowed. When <code>rho=1.0</code> and <code>r=16</code>, LoRA adapters are limited to exactly 16 ranks, preventing any redistribution from occurring. A recommended value for EVA with redistribution is 2.0, meaning the maximum rank allowed for a layer is 2r.</p> <p data-svelte-h="svelte-39tyen">It is recommended to perform EVA initialization on an accelerator(e.g. CUDA GPU, Intel XPU) as it is much faster. To optimize the amount of available memory for EVA, you can use the <code>low_cpu_mem_usage</code> flag in <a href="/docs/peft/pr_3207/en/package_reference/peft_model#peft.get_peft_model">get_peft_model()</a>:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->peft_model = 
get_peft_model(model, peft_config, low_cpu_mem_usage=<span class="hljs-literal">True</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1edy5ig">Then, call <a href="/docs/peft/pr_3207/en/package_reference/lora#peft.initialize_lora_eva_weights">initialize_lora_eva_weights()</a> to initialize the EVA weights (in most cases the dataloader used for eva initialization can be the same as the one used for finetuning):</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->initialize_lora_eva_weights(peft_model, dataloader)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1ofx187">EVA works out of the box with bitsandbytes. 
Simply initialize the model with <code>quantization_config</code> and call <a href="/docs/peft/pr_3207/en/package_reference/lora#peft.initialize_lora_eva_weights">initialize_lora_eva_weights()</a> as usual.</p> <blockquote class="tip" data-svelte-h="svelte-2xjfr6"><p>For further instructions on using EVA, please refer to our <a href="https://github.com/huggingface/peft/tree/main/examples/eva_finetuning" rel="nofollow">documentation</a>.</p></blockquote> <h3 class="relative group"><a id="loftq" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#loftq"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>LoftQ</span></h3> <h4 class="relative group"><a id="standard-approach" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#standard-approach"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 
67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Standard approach</span></h4> <p data-svelte-h="svelte-icixot">When quantizing the base model for QLoRA training, consider using the <a href="https://huggingface.co/papers/2310.08659" rel="nofollow">LoftQ initialization</a>, which has been shown to improve performance when training quantized models. The idea is that the LoRA weights are initialized such that the quantization error is minimized. To use LoftQ, follow <a href="https://github.com/huggingface/peft/tree/main/examples/loftq_finetuning" rel="nofollow">these instructions</a>.</p> <blockquote class="tip" data-svelte-h="svelte-p2g389"><p>Learn more about how PEFT works with quantization and how to use LoftQ in the <a href="quantization">Quantization</a> guide.</p></blockquote> <h3 class="relative group"><a id="rank-stabilized-lora" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#rank-stabilized-lora"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 
0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Rank-stabilized LoRA</span></h3> <p data-svelte-h="svelte-mfb0ip">Another way to initialize <a href="/docs/peft/pr_3207/en/package_reference/lora#peft.LoraConfig">LoraConfig</a> is with the <a href="https://huggingface.co/papers/2312.03732" rel="nofollow">rank-stabilized LoRA (rsLoRA)</a> method. The LoRA architecture scales each adapter during every forward pass by a fixed scalar which is set at initialization and depends on the rank <code>r</code>. The scalar is given by <code>lora_alpha/r</code> in the original implementation, but rsLoRA uses <code>lora_alpha/math.sqrt(r)</code> which stabilizes the adapters and increases the performance potential from using a higher <code>r</code>.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 
h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> peft <span class="hljs-keyword">import</span> LoraConfig | |
| config = LoraConfig(use_rslora=<span class="hljs-literal">True</span>, ...)<!-- HTML_TAG_END --></pre></div> <h3 class="relative group"><a id="lora-ga" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#lora-ga"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>LoRA-GA</span></h3> <p data-svelte-h="svelte-19ylu0a"><a href="../package_reference/lora#lora-ga">LoRA-GA</a> (Low-Rank Adaptation with Gradient Approximation) initializes the adapter | |
| weights by performing SVD on estimated gradients, so that the weights are aligning closer to full-finetuning for faster | |
| convergence.</p> <p data-svelte-h="svelte-wa0is0">This method requires an initialization function to estimate the gradients | |
| before beginning the actual training:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> peft.tuners.lora <span class="hljs-keyword">import</span> preprocess_loraga | |
| <span class="hljs-keyword">def</span> <span class="hljs-title function_">train_step</span>(): | |
| <span class="hljs-string">"""Run forward and backward passes for gradient estimation."""</span> | |
| dataloader_iter = <span class="hljs-built_in">iter</span>(grad_dataloader) | |
| <span class="hljs-keyword">for</span> _ <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(N): | |
| batch = <span class="hljs-built_in">next</span>(dataloader_iter) | |
| batch = {k: v.to(device) <span class="hljs-keyword">for</span> k, v <span class="hljs-keyword">in</span> batch.items()} | |
| outputs = model(**batch) | |
| loss = outputs.loss | |
| loss.backward() | |
| preprocess_loraga(model, lora_config, train_step)<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="variants" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#variants"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Variants</span></h2> <p data-svelte-h="svelte-6vi5lg">PEFT implements LoRA variants that improve upon the original LoRA.</p> <h3 class="relative group"><a id="weight-decomposed-low-rank-adaptation-dora" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#weight-decomposed-low-rank-adaptation-dora"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 
79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Weight-Decomposed Low-Rank Adaptation (DoRA)</span></h3> <p data-svelte-h="svelte-x8nt61">This technique decomposes the updates of the weights into two parts, magnitude and direction. Direction is handled by normal LoRA, whereas the magnitude is handled by a separate learnable parameter. This can improve the performance of LoRA, especially at low ranks. For more information on DoRA, see <a href="https://huggingface.co/papers/2402.09353" rel="nofollow">https://huggingface.co/papers/2402.09353</a>.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- 
HTML_TAG_START --><span class="hljs-keyword">from</span> peft <span class="hljs-keyword">import</span> LoraConfig | |
| config = LoraConfig(use_dora=<span class="hljs-literal">True</span>, ...)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-8wzhde">If parts of the model or the DoRA adapter are offloaded to CPU you can get a significant speedup at the cost of some temporary (ephemeral) VRAM overhead by using <code>ephemeral_gpu_offload=True</code> in <code>config.runtime_config</code>.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> peft <span class="hljs-keyword">import</span> LoraConfig, LoraRuntimeConfig | |
| config = LoraConfig(use_dora=<span class="hljs-literal">True</span>, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=<span class="hljs-literal">True</span>), ...)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-16lyt50">A <code>PeftModel</code> with a DoRA adapter can also be loaded with <code>ephemeral_gpu_offload=True</code> flag using the <code>from_pretrained</code> method as well as the <code>load_adapter</code> method.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> peft <span class="hljs-keyword">import</span> PeftModel | |
| model = PeftModel.from_pretrained(base_model, peft_model_id, ephemeral_gpu_offload=<span class="hljs-literal">True</span>)<!-- HTML_TAG_END --></pre></div> <h4 class="relative group"><a id="optimization" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#optimization"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Optimization</span></h4> <p data-svelte-h="svelte-8z5o72">DoRA is optimized (computes faster and takes less memory) for models in the evaluation mode, or when dropout is set to 0. We reuse the | |
| base result at those times to get the speedup. | |
| Running <a href="https://github.com/huggingface/peft/blob/main/examples/dora_finetuning/dora_finetuning.py" rel="nofollow">dora finetuning</a> | |
| with <code>CUDA_VISIBLE_DEVICES=0 ZE_AFFINITY_MASK=0 time python examples/dora_finetuning/dora_finetuning.py --quantize --lora_dropout 0 --batch_size 16 --eval_step 2 --use_dora</code> on a 4090 with gradient accumulation set to 2 and max step to 20 resulted with the following observations:</p> <table data-svelte-h="svelte-1dslnm4"><thead><tr><th align="center"></th> <th align="center">Without Optimization</th> <th align="center">With Optimization</th></tr></thead> <tbody><tr><td align="center">train runtime (sec)</td> <td align="center">359.7298</td> <td align="center"><strong>279.2676</strong></td></tr> <tr><td align="center">train samples per second</td> <td align="center">1.779</td> <td align="center"><strong>2.292</strong></td></tr> <tr><td align="center">train steps per second</td> <td align="center">0.056</td> <td align="center"><strong>0.072</strong></td></tr></tbody></table> <p data-svelte-h="svelte-6epr3r">Moreover, it is possible to further increase runtime performance of DoRA by using the <code>DoraCaching</code> helper context. 
This requires the model to be in <code>eval</code> mode:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> peft.helpers <span class="hljs-keyword">import</span> DoraCaching | |
| model.<span class="hljs-built_in">eval</span>() | |
| <span class="hljs-keyword">with</span> DoraCaching(): | |
| output = model(inputs)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-8kmgme">For <a href="https://huggingface.co/meta-llama/Llama-3.1-8B" rel="nofollow"><code>meta-llama/Llama-3.1-8B</code></a>, the <a href="https://github.com/huggingface/peft/blob/main/examples/dora_finetuning/dora-caching.py" rel="nofollow">DoRA caching benchmark script</a> shows that, compared to LoRA:</p> <ul data-svelte-h="svelte-r8pzuc"><li>DoRA without caching requires 139% more time</li> <li>DoRA without caching requires 4% more memory</li> <li>DoRA with caching requires 17% more time</li> <li>DoRA with caching requires 41% more memory</li></ul> <p data-svelte-h="svelte-86zkpy">Caching can thus make inference with DoRA significantly faster but it also requires significantly more memory. Ideally, if the use case allows it, just merge the DoRA adapter to avoid both memory and runtime overhead.</p> <h4 class="relative group"><a id="caveats" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#caveats"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Caveats</span></h4> <ul data-svelte-h="svelte-r773n2"><li>DoRA only supports embedding, linear, 
and Conv2d layers at the moment.</li> <li>DoRA introduces a bigger overhead than pure LoRA, so it is recommended to merge weights for inference, see <a href="/docs/peft/pr_3207/en/package_reference/tuners#peft.tuners.tuners_utils.BaseTuner.merge_and_unload">LoraModel.merge_and_unload()</a>.</li> <li>DoRA should work with weights quantized with bitsandbytes (“QDoRA”). However, issues have been reported when using QDoRA with DeepSpeed Zero2.</li></ul> <h2 class="relative group"><a id="training" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#training"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Training</span></h2> <p data-svelte-h="svelte-1r3v4w">This section shows how to handle more complex training scenarios instead of only applying a low-rank adapter | |
| to the model and feeding data.</p> <h3 class="relative group"><a id="qlora-style-training" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#qlora-style-training"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>QLoRA-style training</span></h3> <p data-svelte-h="svelte-15vxux2">The default LoRA settings in PEFT add trainable weights to the query and value layers of each attention block. But <a href="https://hf.co/papers/2305.14314" rel="nofollow">QLoRA</a>, which adds trainable weights to all the linear layers of a transformer model, can provide performance equal to a fully finetuned model. 
To apply LoRA to all the linear layers, like in QLoRA, set <code>target_modules="all-linear"</code> (easier than specifying individual modules by name which can vary depending on the architecture).</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->config = LoraConfig(target_modules=<span class="hljs-string">"all-linear"</span>, ...)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-eo2lbi">For more information about how to apply quantization to PEFT adapters, refer to the <a href="quantization">quantization guide</a>.</p> <h3 class="relative group"><a id="memory-efficient-layer-replication-with-lora" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" 
href="#memory-efficient-layer-replication-with-lora"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Memory efficient Layer Replication with LoRA</span></h3> <p data-svelte-h="svelte-ku1fuj">An approach used to improve the performance of models is to expand a model by duplicating layers in the model to build a larger model from a pretrained model of a given size. For example increasing a 7B model to a 10B model as described in the <a href="https://huggingface.co/papers/2312.15166" rel="nofollow">SOLAR</a> paper. PEFT LoRA supports this kind of expansion in a memory efficient manner that supports further fine-tuning using LoRA adapters attached to the layers post replication of the layers. The replicated layers do not take additional memory as they share the underlying weights so the only additional memory required is the memory for the adapter weights. 
To use this feature you would create a config with the <code>layer_replication</code> argument.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->config = LoraConfig(layer_replication=[[<span class="hljs-number">0</span>,<span class="hljs-number">4</span>], [<span class="hljs-number">2</span>,<span class="hljs-number">5</span>]], ...)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1ttqqw0">Assuming the original model had 5 layers <code>[0, 1, 2 ,3, 4]</code>, this would create a model with 7 layers arranged as <code>[0, 1, 2, 3, 2, 3, 4]</code>. This follows the <a href="https://github.com/arcee-ai/mergekit" rel="nofollow">mergekit</a> pass through merge convention where sequences of layers specified as start inclusive and end exclusive tuples are stacked to build the final model. 
Each layer in the final model gets its own distinct set of LoRA adapters.</p> <p data-svelte-h="svelte-foj0wd"><a href="https://huggingface.co/abacusai/Fewshot-Metamath-OrcaVicuna-Mistral-10B" rel="nofollow">Fewshot-Metamath-OrcaVicuna-Mistral-10B</a> is an example of a model trained using this method on Mistral-7B expanded to 10B. The | |
| <a href="https://huggingface.co/abacusai/Fewshot-Metamath-OrcaVicuna-Mistral-10B/blob/main/adapter_config.json" rel="nofollow">adapter_config.json</a> shows a sample LoRA adapter config applying this method for fine-tuning.</p> <h3 class="relative group"><a id="fine-grained-control-over-ranks-and-alpha-scaling" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#fine-grained-control-over-ranks-and-alpha-scaling"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Fine grained control over ranks and alpha (scaling)</span></h3> <p data-svelte-h="svelte-kq4poh">By default, all layers targeted with LoRA will have the same rank <code>r</code> and the same <code>lora_alpha</code> (which determines the LoRA scaling), depending on what was specified in the <a href="/docs/peft/pr_3207/en/package_reference/lora#peft.LoraConfig">LoraConfig</a>. In some cases, however, you may want to indicate different values for different layers. This is possible by passing the <code>rank_pattern</code> and <code>alpha_pattern</code> arguments to <a href="/docs/peft/pr_3207/en/package_reference/lora#peft.LoraConfig">LoraConfig</a>. 
These arguments should be dictionaries with the key being the layer name and the value being the rank/alpha value. The keys can be <a href="https://docs.python.org/3/library/re.html" rel="nofollow">regular expressions</a> (regex). All LoRA layers that are not explicitly mentioned in <code>rank_pattern</code> and <code>alpha_pattern</code> will take the default <code>r</code> and <code>lora_alpha</code> values.</p> <p data-svelte-h="svelte-dtt8e">To give an example, let’s assume that we have a model with the following structure:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">>>> </span><span class="hljs-built_in">print</span>(model) | |
| Outer( | |
| (foo): Linear(...) | |
| (module): Middle( | |
| (foo): Linear(...) | |
| (foobar): Linear(...) | |
| (module): Inner( | |
| (foo): Linear(...) | |
| (barfoo): Linear(...) | |
| ) | |
| ) | |
| )<!-- HTML_TAG_END --></pre></div> <ul data-svelte-h="svelte-1qf6hnm"><li><code>rank_pattern={"foo": 42}</code> will match all 3 <code>foo</code> layers. Neither <code>foobar</code> nor <code>barfoo</code> are matched.</li> <li><code>rank_pattern={"^foo": 42}</code> will only match the <code>foo</code> layer of the model, but neither <code>module.foo</code> nor <code>module.module.foo</code>. This is because the <code>^</code> means “start of string” when using regular expressions, and only <code>foo</code> starts with <code>"foo"</code>, the other layer names have prefixes.</li> <li><code>rank_pattern={"^module.foo": 42}</code> matches only <code>module.foo</code>, but not <code>module.module.foo</code>, for the same reason.</li> <li><code>rank_pattern={"module.foo": 42}</code> matches both <code>module.foo</code> and <code>module.module.foo</code>, but not <code>foo</code>.</li> <li><code>rank_pattern={"^foo": 42, "^module.module.foo": 55}</code> matches <code>foo</code> and <code>module.module.foo</code>, respectively, but not <code>module.foo</code>.</li> <li>There is no need to indicate <code>$</code> to mark the end of the match, as this is added automatically by PEFT.</li></ul> <p data-svelte-h="svelte-qh5siy">The same logic applies to <code>alpha_pattern</code>. 
If you’re in doubt, don’t try to get fancy with regular expressions — just pass the full name for each module with a different rank/alpha, preceded by the <code>^</code> prefix, and you should be good.</p> <h3 class="relative group"><a id="targeting-nnparameter-directly" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#targeting-nnparameter-directly"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Targeting nn.Parameter directly</span></h3> <p data-svelte-h="svelte-jvkb1r">Generally, you should use <code>target_modules</code> to target the module (e.g. <code>nn.Linear</code>). However, in some circumstances, this is not possible. E.g., in many mixture of expert (MoE) layers in HF Transformers, instead of using <code>nn.Linear</code>, an <code>nn.Parameter</code> is used. PEFT normally overwrites the <code>forward</code> method for LoRA, but for <code>nn.Parameter</code>, there is none. Therefore, to apply LoRA to that parameter, it needs to be targeted with <code>target_parameters</code>. 
As an example, for <a href="https://huggingface.co/collections/meta-llama/llama-4-67f0c30d9fe03840bc9d0164" rel="nofollow">Llama4</a>, you can pass: <code>target_parameters=['feed_forward.experts.gate_up_proj', 'feed_forward.experts.down_proj']</code>.</p> <p data-svelte-h="svelte-1fvn09m">Note that when targeting expert parameters, PEFT can add a substantial runtime overhead. The reason is that PEFT always materializes the LoRA contribution for <em>each expert</em> even if only a small number of experts is required. During training, this is less relevant since, over the course of the sequence, typically a large fraction of experts is activated at least once. However, during inference, normally a KV cache is used and we thus need to only compute the last token, which means that only a small number of experts is activated. Therefore, using LoRA on MoE layers can result in a substantial slowdown at inference time. The recommendation is thus to merge the weights (<code>model.merge_adapter()</code> or <code>model = model.merge_and_unload()</code>). 
This removes the PEFT overhead.</p> <p data-svelte-h="svelte-ecn69p">A more detailed investigation of this issue can be found on this <a href="https://github.com/huggingface/peft/pull/3139" rel="nofollow">pull request on MoE optimization</a>.</p> <h4 class="relative group"><a id="caveats" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#caveats"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Caveats</span></h4> <ul data-svelte-h="svelte-1h2fpez"><li>At the moment, this argument allows to target 2-dim or 3-dim <code>nn.Parameter</code>s. 
It is assumed that in the case of a 3-dim parameter, the 0th dimension is the expert dimension.</li> <li>It is currently not possible to add multiple LoRA adapters (via <code>model.add_adapter</code> or <code>model.load_adapter</code>) that use <code>target_parameters</code> at the same time.</li></ul> <h4 class="relative group"><a id="moe-expert-parameters-and-vllm" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#moe-expert-parameters-and-vllm"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>MoE expert parameters and vLLM</span></h4> <p data-svelte-h="svelte-z4cpft">Some MoE models in Transformers store expert weights as <code>nn.Parameter</code> tensors (often 3D), not <code>nn.Linear</code> modules. | |
| To apply LoRA to those experts, use <code>target_parameters</code> and set a per-layer rank with <code>rank_pattern</code>:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->num_experts = <span class="hljs-built_in">getattr</span>(model.config, <span class="hljs-string">"num_local_experts"</span>, <span class="hljs-literal">None</span>) <span class="hljs-keyword">or</span> model.config.num_experts | |
| effective_r = <span class="hljs-built_in">max</span>(<span class="hljs-number">1</span>, r // num_experts) | |
| config = LoraConfig( | |
| r=r, | |
| lora_alpha=<span class="hljs-number">32</span>, | |
| target_modules=[<span class="hljs-string">"q_proj"</span>, <span class="hljs-string">"v_proj"</span>], | |
| target_parameters=[ | |
| <span class="hljs-comment"># Mixtral / Qwen3-MoE / GPT-OSS</span> | |
| <span class="hljs-string">"mlp.experts.gate_up_proj"</span>, | |
| <span class="hljs-string">"mlp.experts.down_proj"</span>, | |
| <span class="hljs-comment"># Llama4</span> | |
| <span class="hljs-comment"># "feed_forward.experts.gate_up_proj",</span> | |
| <span class="hljs-comment"># "feed_forward.experts.down_proj",</span> | |
| ], | |
| rank_pattern={ | |
| <span class="hljs-string">"experts.gate_up_proj"</span>: effective_r, | |
| <span class="hljs-string">"experts.down_proj"</span>: effective_r, | |
| }, | |
| )<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-jc16sb">This keeps the total LoRA parameter budget similar to dense layers (see | |
| <a href="https://thinkingmachines.ai/blog/lora/" rel="nofollow">LoRA Without Regret</a> by Schulman et al.). | |
| Non-expert modules use the default rank <code>r</code>.</p> <p data-svelte-h="svelte-19jfwul">Accelerated inference with the fine-tuned model is possible with, for example, <a href="https://vllm.ai/" rel="nofollow">vLLM</a> which supports fused MoE expert layers since v0.11.2.</p> <h3 class="relative group"><a id="efficiently-train-tokens-alongside-lora" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#efficiently-train-tokens-alongside-lora"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Efficiently train tokens alongside LoRA</span></h3> <p data-svelte-h="svelte-1j6km4p">PEFT LoRA adapters support adding new tokens with the <code>trainable_token_indices</code> parameter. This allows tuning of other tokens alongside fine-tuning specific layers. Only the specified tokens are trained and all other tokens are untouched. It saves memory and doesn’t throw away learned context from existing token embeddings unlike training the whole embedding matrix. 
Under the hood this method uses the layer of <a href="/docs/peft/pr_3207/en/package_reference/trainable_tokens#peft.TrainableTokensModel">TrainableTokensModel</a>.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># for layer 'embed_tokens'</span> | |
| config = LoraConfig(trainable_token_indices=[idx_1, idx_2, ...], ...) | |
| <span class="hljs-comment"># specific embedding layer</span> | |
| config = LoraConfig(trainable_token_indices={<span class="hljs-string">'emb_tokens'</span>: [idx_1, idx_2, ...]}, ...)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-66nz7l">In the snippet below we show how to add new tokens to the model and how to train it alongside the other layers in the model.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoTokenizer, AutoModelForCausalLM | |
| <span class="hljs-keyword">from</span> peft <span class="hljs-keyword">import</span> get_peft_model, LoraConfig | |
| base_model = AutoModelForCausalLM.from_pretrained(<span class="hljs-string">"mistralai/Mistral-7B-v0.1"</span>) | |
| tokenizer = AutoTokenizer.from_pretrained(<span class="hljs-string">"mistralai/Mistral-7B-v0.1"</span>) | |
| <span class="hljs-comment"># we define our new tokens and add them to the tokenizer as special tokens</span> | |
| special_tokens = [<span class="hljs-string">'<|start_think|>'</span>, <span class="hljs-string">'<|stop_think|>'</span>] | |
| tokenizer.add_special_tokens({<span class="hljs-string">'additional_special_tokens'</span>: special_tokens}) | |
| <span class="hljs-comment"># make room for new tokens in the embedding matrix if it isn't big enough already</span> | |
| base_model.resize_token_embeddings(<span class="hljs-built_in">max</span>(<span class="hljs-built_in">len</span>(tokenizer), base_model.model.embed_tokens.num_embeddings)) | |
| <span class="hljs-comment"># typical LoRA config with `trainable_token_indices` targeting embedding layer `embed_tokens`</span> | |
| <span class="hljs-comment"># and specifically our new tokens we just added</span> | |
| lora_config = LoraConfig( | |
| target_modules=<span class="hljs-string">'all-linear'</span>, | |
| trainable_token_indices={<span class="hljs-string">'embed_tokens'</span>: tokenizer.convert_tokens_to_ids(special_tokens)}, | |
| ) | |
| peft_model = get_peft_model(base_model, lora_config) | |
| <span class="hljs-comment"># proceed to train the model like normal</span> | |
| [...]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-n4djpz">The token weights are saved as a part of the adapter state dict alongside the LoRA weights. Full fine-tuning and saving the embedding matrix would have stored a much bigger file.</p> <p data-svelte-h="svelte-19mzdup">To give a bit of an indication how much VRAM can be saved, a rudimentary comparison of the above example was made between training the embedding matrix fully (<code>modules_to_save=["embed_tokens"]</code>), using a LoRA for the embedding matrix (<code>target_modules=[..., "embed_tokens"]</code>, rank 32) and trainable tokens (<code>trainable_token_indices=[...]</code>, 6 tokens):</p> <table data-svelte-h="svelte-zhxypc"><thead><tr><th align="right"></th> <th align="center">Trainable Tokens</th> <th align="center">LoRA</th> <th align="center">Full Fine-tuning</th></tr></thead> <tbody><tr><td align="right">VRAM</td> <td align="center">15,562 MB</td> <td align="center">15,581MB</td> <td align="center">~16,500MB</td></tr> <tr><td align="right">Influence</td> <td align="center">6 tokens</td> <td align="center">all tokens</td> <td align="center">all tokens</td></tr></tbody></table> <h3 class="relative group"><a id="weight-tying" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#weight-tying"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 
0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Weight tying</span></h3> <p data-svelte-h="svelte-ex5mu1">Many causal LMs use <strong>weight tying</strong>, where two or more weights share the same underlying parameters. In the most common case, the input embedding weights (<code>embed_tokens</code>) and output projection weights (<code>lm_head</code>) share the same tensor. This is because it reduces parameters and usually preserves model quality.</p> <p data-svelte-h="svelte-10xpw7p">It’s not always obvious how PEFT deals with these tied weights when they are targeted for fine-tuning. For LoRA, the <code>ensure_weight_tying</code> on the <a href="/docs/peft/pr_3207/en/package_reference/lora#peft.LoraConfig">LoraConfig</a> controls whether PEFT should explicitly keep adapter-side updates tied for those layers. In practice, this can affect <code>modules_to_save</code>, <code>target_modules</code>, and <code>trainable_token_indices</code>. 
Note that this logic partially relies on convention when it comes to naming the layers (<code>"embed_tokens"</code>, <code>"lm_head"</code>) and proper working cannot be guaranteed if those conventions are not used.</p> <p data-svelte-h="svelte-1pk84qw">The tables below summarize expected behavior.</p> <h4 class="relative group"><a id="modulestosave" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#modulestosave"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>modules_to_save</span></h4> <table data-svelte-h="svelte-xtypic"><thead><tr><th>Base model weights tied</th> <th><code>ensure_weight_tying</code></th> <th><code>LoraConfig</code> shape</th> <th>Behavior</th></tr></thead> <tbody><tr><td>No</td> <td><code>False</code></td> <td><code>modules_to_save=["embed_tokens"]</code> or <code>["lm_head"]</code></td> <td>Add <code>ModulesToSaveWrapper</code> on selected layer only</td></tr> <tr><td>No</td> <td><code>True</code></td> <td><code>modules_to_save=["embed_tokens"]</code> or <code>["lm_head"]</code></td> <td>Warn, then add <code>ModulesToSaveWrapper</code> on selected layer only</td></tr> <tr><td>Yes</td> 
<td><code>False</code></td> <td><code>modules_to_save=["embed_tokens"]</code> or <code>["lm_head"]</code></td> <td>Treat as separate</td></tr> <tr><td>Yes</td> <td><code>True</code></td> <td><code>modules_to_save=["embed_tokens"]</code> or <code>["lm_head"]</code></td> <td>Wrap tied layers and keep wrappers tied</td></tr> <tr><td>No</td> <td><code>False</code></td> <td><code>modules_to_save=["embed_tokens", "lm_head"]</code></td> <td>Treat as separate</td></tr> <tr><td>No</td> <td><code>True</code></td> <td><code>modules_to_save=["embed_tokens", "lm_head"]</code></td> <td>Warn, then treat as separate</td></tr> <tr><td>Yes</td> <td><code>False</code></td> <td><code>modules_to_save=["embed_tokens", "lm_head"]</code></td> <td>Warn, then treat as separate</td></tr> <tr><td>Yes</td> <td><code>True</code></td> <td><code>modules_to_save=["embed_tokens", "lm_head"]</code></td> <td>Keep <code>ModulesToSaveWrapper</code>s tied</td></tr></tbody></table> <h4 class="relative group"><a id="targetmodules" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#targetmodules"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> 
<span>target_modules</span></h4> <table data-svelte-h="svelte-e0umdx"><thead><tr><th>Base model weights tied</th> <th><code>ensure_weight_tying</code></th> <th><code>LoraConfig</code> shape</th> <th>Behavior</th></tr></thead> <tbody><tr><td>No</td> <td><code>False</code></td> <td><code>target_modules=["embed_tokens"]</code> or <code>["lm_head"]</code></td> <td>Add LoRA on selected layer only</td></tr> <tr><td>No</td> <td><code>True</code></td> <td><code>target_modules=["embed_tokens"]</code> or <code>["lm_head"]</code></td> <td>Warn, then add LoRA on selected layer only</td></tr> <tr><td>Yes</td> <td><code>False</code></td> <td><code>target_modules=["embed_tokens"]</code> or <code>["lm_head"]</code></td> <td>Treat as separate</td></tr> <tr><td>Yes</td> <td><code>True</code></td> <td><code>target_modules=["embed_tokens"]</code> or <code>["lm_head"]</code></td> <td>Keep LoRA adapters tied</td></tr> <tr><td>No</td> <td><code>False</code></td> <td><code>target_modules=["embed_tokens", "lm_head"]</code></td> <td>Treat as separate</td></tr> <tr><td>No</td> <td><code>True</code></td> <td><code>target_modules=["embed_tokens", "lm_head"]</code></td> <td>Warn, then treat as separate</td></tr> <tr><td>Yes</td> <td><code>False</code></td> <td><code>target_modules=["embed_tokens", "lm_head"]</code></td> <td>Warn, then treat as separate</td></tr> <tr><td>Yes</td> <td><code>True</code></td> <td><code>target_modules=["embed_tokens", "lm_head"]</code></td> <td>Keep LoRA adapters tied</td></tr></tbody></table> <h4 class="relative group"><a id="trainabletokenindices" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#trainabletokenindices"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path 
d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>trainable_token_indices</span></h4> <p data-svelte-h="svelte-1atwpg3">For trainable tokens, we have the additional complication that even if the LM head and embeddings are tied, as a user I may want to fine-tune <em>different</em> tokens on them. In the example table below, we thus differentiate between fine-tuning the same and fine-tuning different tokens.</p> <table data-svelte-h="svelte-192ndes"><thead><tr><th>Base model weights tied</th> <th><code>ensure_weight_tying</code></th> <th><code>LoraConfig</code> shape</th> <th>Behavior</th></tr></thead> <tbody><tr><td>No</td> <td><code>False</code></td> <td><code>trainable_token_indices=[1, 2, 3]</code></td> <td>Trainable tokens on embeddings only</td></tr> <tr><td>No</td> <td><code>True</code></td> <td><code>trainable_token_indices=[1, 2, 3]</code></td> <td>Warn, then trainable tokens on embeddings only</td></tr> <tr><td>Yes</td> <td><code>False</code></td> <td><code>trainable_token_indices=[1, 2, 3]</code></td> <td>Tied trainable tokens</td></tr> <tr><td>Yes</td> <td><code>True</code></td> <td><code>trainable_token_indices=[1, 2, 3]</code></td> <td>Tied trainable tokens</td></tr> <tr><td>No</td> <td><code>False</code></td> <td><code>trainable_token_indices={"lm_head": [1, 2], "embed_tokens": [1, 2]}</code></td> <td>Treat as separate</td></tr> <tr><td>No</td> <td><code>True</code></td> <td><code>trainable_token_indices={"lm_head": [1, 2], "embed_tokens": [1, 
2]}</code></td> <td>Warn, then treat as separate</td></tr> <tr><td>Yes</td> <td><code>False</code></td> <td><code>trainable_token_indices={"lm_head": [1, 2], "embed_tokens": [1, 2]}</code></td> <td>Tied trainable tokens</td></tr> <tr><td>Yes</td> <td><code>True</code></td> <td><code>trainable_token_indices={"lm_head": [1, 2], "embed_tokens": [1, 2]}</code></td> <td>Tied trainable tokens</td></tr> <tr><td>No</td> <td><code>False</code></td> <td><code>trainable_token_indices={"lm_head": [1, 2], "embed_tokens": [3, 4]}</code></td> <td>Treat as separate</td></tr> <tr><td>No</td> <td><code>True</code></td> <td><code>trainable_token_indices={"lm_head": [1, 2], "embed_tokens": [3, 4]}</code></td> <td>Warn, then treat as separate</td></tr> <tr><td>Yes</td> <td><code>False</code></td> <td><code>trainable_token_indices={"lm_head": [1, 2], "embed_tokens": [3, 4]}</code></td> <td>Treat as separate</td></tr> <tr><td>Yes</td> <td><code>True</code></td> <td><code>trainable_token_indices={"lm_head": [1, 2], "embed_tokens": [3, 4]}</code></td> <td>Error</td></tr></tbody></table> <p data-svelte-h="svelte-1rv3xk7">For users, this means:</p> <ul data-svelte-h="svelte-qamjs7"><li>In general, if you want to fine-tune weights that are tied and want to keep them tied, pass <code>ensure_weight_tying=True</code>.</li> <li>If your base model’s weights are untied, <code>ensure_weight_tying=True</code> cannot force tying and only warns.</li> <li>For <code>trainable_token_indices</code>, tied layers must use the same token indices when <code>ensure_weight_tying=True</code>.</li></ul> <h2 class="relative group"><a id="optimizers" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#optimizers"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" 
preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Optimizers</span></h2> <p data-svelte-h="svelte-ws58tf">LoRA training can optionally include special purpose optimizers. Currently PEFT supports LoRA-FA and LoRA+.</p> <h3 class="relative group"><a id="lora-fa-optimizer" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#lora-fa-optimizer"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>LoRA-FA Optimizer</span></h3> <p data-svelte-h="svelte-18tpzlm">LoRA training can be more effective and efficient using LoRA-FA, as described in <a 
href="https://huggingface.co/papers/2308.03303" rel="nofollow">LoRA-FA</a>. LoRA-FA reduces activation memory consumption by fixing the matrix A and only tuning the matrix B. During training, the gradient of B is optimized to approximate the full parameter fine-tuning gradient. Moreover, the memory consumption of LoRA-FA is not sensitive to the rank (since it erases the activation of $A$), therefore it can improve performance by enlarging lora rank without increasing memory consumption.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> peft <span class="hljs-keyword">import</span> LoraConfig, get_peft_model | |
| <span class="hljs-keyword">from</span> peft.optimizers <span class="hljs-keyword">import</span> create_lorafa_optimizer | |
| <span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoModelForCausalLM, Trainer, get_cosine_schedule_with_warmup | |
| base_model = AutoModelForCausalLM.from_pretrained(<span class="hljs-string">"meta-llama/Meta-Llama-3-8B-Instruct"</span>) | |
| config = LoraConfig(...) | |
| model = get_peft_model(base_model, config) | |
| optimizer = create_lorafa_optimizer( | |
| model=model, | |
| r=<span class="hljs-number">128</span>, | |
| lora_alpha=<span class="hljs-number">32</span>, | |
| lr=<span class="hljs-number">7e-5</span>, | |
| ) | |
| scheduler = get_cosine_schedule_with_warmup( | |
| optimizer, | |
| num_warmup_steps=<span class="hljs-number">100</span>, | |
| num_training_steps=<span class="hljs-number">1000</span>, | |
| ) | |
| trainer = Trainer( | |
| ..., | |
| optimizers=(optimizer, scheduler), | |
| )<!-- HTML_TAG_END --></pre></div> <h3 class="relative group"><a id="lora-optimized-lora" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#lora-optimized-lora"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>LoRA+ optimized LoRA</span></h3> <p data-svelte-h="svelte-1i0u5p9">LoRA training can be optimized using <a href="https://huggingface.co/papers/2402.12354" rel="nofollow">LoRA+</a>, which uses different learning rates for the adapter matrices A and B, shown to increase finetuning speed by up to 2x and performance by 1-2%.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" 
transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> peft <span class="hljs-keyword">import</span> LoraConfig, get_peft_model | |
| <span class="hljs-keyword">from</span> peft.optimizers <span class="hljs-keyword">import</span> create_loraplus_optimizer | |
| <span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> Trainer | |
| <span class="hljs-keyword">import</span> bitsandbytes <span class="hljs-keyword">as</span> bnb | |
| base_model = ... | |
| config = LoraConfig(...) | |
| model = get_peft_model(base_model, config) | |
| optimizer = create_loraplus_optimizer( | |
| model=model, | |
| optimizer_cls=bnb.optim.Adam8bit, | |
| lr=<span class="hljs-number">5e-5</span>, | |
| loraplus_lr_ratio=<span class="hljs-number">16</span>, | |
| ) | |
| scheduler = <span class="hljs-literal">None</span> | |
| ... | |
| trainer = Trainer( | |
| ..., | |
| optimizers=(optimizer, scheduler), | |
| )<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="post-training" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#post-training"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Post-Training</span></h2> <p data-svelte-h="svelte-10bblrn">This section shows potential post-processing methods for trained adapters.</p> <h3 class="relative group"><a id="merge-lora-weights-into-the-base-model" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#merge-lora-weights-into-the-base-model"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 
0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Merge LoRA weights into the base model</span></h3> <p data-svelte-h="svelte-1mcg79c">While LoRA is significantly smaller and faster to train, you may encounter latency issues during inference due to separately loading the base model and the LoRA adapter. To eliminate latency, use the <a href="/docs/peft/pr_3207/en/package_reference/tuners#peft.tuners.tuners_utils.BaseTuner.merge_and_unload">merge_and_unload()</a> function to merge the adapter weights with the base model. This allows you to use the newly merged model as a standalone model. The <a href="/docs/peft/pr_3207/en/package_reference/tuners#peft.tuners.tuners_utils.BaseTuner.merge_and_unload">merge_and_unload()</a> function doesn’t keep the adapter weights in memory.</p> <p data-svelte-h="svelte-1wwmf5r">Below is a diagram that explains the intuition of LoRA adapter merging:</p> <div class="flex justify-center" data-svelte-h="svelte-1f6iuw5"><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/peft/lora_diagram.png"></div> <p data-svelte-h="svelte-hlzwis">We show in the snippets below how to run that using PEFT.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" 
transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoModelForCausalLM | |
| <span class="hljs-keyword">from</span> peft <span class="hljs-keyword">import</span> PeftModel | |
| base_model = AutoModelForCausalLM.from_pretrained(<span class="hljs-string">"mistralai/Mistral-7B-v0.1"</span>) | |
| peft_model_id = <span class="hljs-string">"alignment-handbook/zephyr-7b-sft-lora"</span> | |
| model = PeftModel.from_pretrained(base_model, peft_model_id) | |
| model = model.merge_and_unload()<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-i4iz7u">It is important to assign the returned model to a variable and use it, <a href="/docs/peft/pr_3207/en/package_reference/tuners#peft.tuners.tuners_utils.BaseTuner.merge_and_unload">merge_and_unload()</a> is not an in-place operation. If you need to keep a copy of the weights so you can unmerge the adapter later or delete and load different ones, you should use the <a href="/docs/peft/pr_3207/en/package_reference/tuners#peft.tuners.tuners_utils.BaseTuner.merge_adapter">merge_adapter()</a> function instead. Now you have the option to use <a href="/docs/peft/pr_3207/en/package_reference/tuners#peft.tuners.tuners_utils.BaseTuner.unmerge_adapter">unmerge_adapter()</a> to return the base model.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- 
HTML_TAG_START --><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoModelForCausalLM | |
| <span class="hljs-keyword">from</span> peft <span class="hljs-keyword">import</span> PeftModel | |
| base_model = AutoModelForCausalLM.from_pretrained(<span class="hljs-string">"mistralai/Mistral-7B-v0.1"</span>) | |
| peft_model_id = <span class="hljs-string">"alignment-handbook/zephyr-7b-sft-lora"</span> | |
| model = PeftModel.from_pretrained(base_model, peft_model_id) | |
| model.merge_adapter() | |
| <span class="hljs-comment"># unmerge the LoRA layers from the base model</span> | |
| model.unmerge_adapter()<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-b2k1nb">The <a href="/docs/peft/pr_3207/en/package_reference/lora#peft.LoraModel.add_weighted_adapter">add_weighted_adapter()</a> function is useful for merging multiple LoRAs into a new adapter based on a user provided weighting scheme in the <code>weights</code> parameter. Below is an end-to-end example.</p> <p data-svelte-h="svelte-1mwdwt1">First load the base model:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoModelForCausalLM | |
| <span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoTokenizer | |
| <span class="hljs-keyword">from</span> peft <span class="hljs-keyword">import</span> PeftModel | |
| <span class="hljs-keyword">import</span> torch | |
| base_model = AutoModelForCausalLM.from_pretrained( | |
| <span class="hljs-string">"mistralai/Mistral-7B-v0.1"</span>, dtype=torch.float16, device_map=<span class="hljs-string">"auto"</span> | |
| )<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1lnmg1d">Then we load the first adapter:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->peft_model_id = <span class="hljs-string">"alignment-handbook/zephyr-7b-sft-lora"</span> | |
| model = PeftModel.from_pretrained(base_model, peft_model_id, adapter_name=<span class="hljs-string">"sft"</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1n53hd9">Then load a different adapter and merge it with the first one:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->weighted_adapter_name = <span class="hljs-string">"sft-dpo"</span> | |
| model.load_adapter(<span class="hljs-string">"alignment-handbook/zephyr-7b-dpo-lora"</span>, adapter_name=<span class="hljs-string">"dpo"</span>) | |
| model.add_weighted_adapter( | |
| adapters=[<span class="hljs-string">"sft"</span>, <span class="hljs-string">"dpo"</span>], | |
| weights=[<span class="hljs-number">0.7</span>, <span class="hljs-number">0.3</span>], | |
| adapter_name=weighted_adapter_name, | |
| combination_type=<span class="hljs-string">"linear"</span> | |
| ) | |
| model.set_adapter(weighted_adapter_name)<!-- HTML_TAG_END --></pre></div> <blockquote class="tip" data-svelte-h="svelte-18gsis4"><p>There are several supported methods for <code>combination_type</code>. Refer to the <a href="../package_reference/lora#peft.LoraModel.add_weighted_adapter">documentation</a> for more details. Note that “svd” as the <code>combination_type</code> is not supported when using <code>torch.float16</code> or <code>torch.bfloat16</code> as the datatype.</p></blockquote> <p data-svelte-h="svelte-qwbdkg">Now, perform inference:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->device = torch.accelerator.current_accelerator().<span class="hljs-built_in">type</span> <span class="hljs-keyword">if</span> <span class="hljs-built_in">hasattr</span>(torch, <span class="hljs-string">"accelerator"</span>) <span 
class="hljs-keyword">else</span> <span class="hljs-string">"cuda"</span> | |
| tokenizer = AutoTokenizer.from_pretrained(<span class="hljs-string">"mistralai/Mistral-7B-v0.1"</span>) | |
| prompt = <span class="hljs-string">"Hey, are you conscious? Can you talk to me?"</span> | |
| inputs = tokenizer(prompt, return_tensors=<span class="hljs-string">"pt"</span>) | |
| inputs = {k: v.to(device) <span class="hljs-keyword">for</span> k, v <span class="hljs-keyword">in</span> inputs.items()} | |
| <span class="hljs-keyword">with</span> torch.no_grad(): | |
| generate_ids = model.generate(**inputs, max_length=<span class="hljs-number">30</span>) | |
| outputs = tokenizer.batch_decode(generate_ids, skip_special_tokens=<span class="hljs-literal">True</span>, clean_up_tokenization_spaces=<span class="hljs-literal">False</span>)[<span class="hljs-number">0</span>] | |
| <span class="hljs-built_in">print</span>(outputs)<!-- HTML_TAG_END --></pre></div> <h3 class="relative group"><a id="recovering-base-model-performance-via-intruder-dimension-reduction" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#recovering-base-model-performance-via-intruder-dimension-reduction"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Recovering base model performance via intruder dimension reduction</span></h3> <p data-svelte-h="svelte-25a5y0">The paper <a href="https://huggingface.co/papers/2410.21228" rel="nofollow">LoRA vs Full Fine-tuning: An Illusion of Equivalence</a> argues | |
| that LoRA training introduces extra dimensions into the weights that have very little in common with the already | |
| learnt weights and lead to forgetting of already learned information. PEFT implements the suggested mitigation | |
| in <a href="/docs/peft/pr_3207/en/package_reference/lora#peft.tuners.lora.intruders.reduce_intruder_dimension">peft.tuners.lora.intruders.reduce_intruder_dimension()</a>.</p> <p data-svelte-h="svelte-1tr6wvs">The mitigation will take a PEFT model with a loaded LoRA and create a new, modified adapter that is loaded alongside | |
| the existing adapter and now the active adapter.</p> <p data-svelte-h="svelte-1ni337v">Example usage:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> peft.tuners.lora.intruders <span class="hljs-keyword">import</span> reduce_intruder_dimension | |
| peft_model = AutoPeftModelForCausalLM.from_pretrained(<span class="hljs-string">'hubnemo/llama-3.2b-metamathqa-lora64'</span>) | |
| reduce_intruder_dimension( | |
| peft_model, | |
| mitigation_lambda=<span class="hljs-number">0.75</span>, | |
| ) | |
| peft_model.generate(...)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-106qof5">There are a few hyper-parameters that can be used for tuning the effectiveness of the mitigation but, as evidenced | |
| in Figure 8 of the paper, it will always be a trade-off between task accuracy learned by the adapter and forgetting | |
| of the base model’s knowledge. The mitigation will remove information from the adapter to reduce the impact on | |
| forgetting previous knowledge but this also means that some information about the task learned by the adapter is | |
| lost as well.</p> <p data-svelte-h="svelte-smif4x">While the defaults are set to deliver a good trade-off between the two factors it is not guaranteed that the defaults | |
| will hold for your adapter, your model and your data, therefore it is wise to have a benchmark ready to measure | |
| the effect.</p> <h2 class="relative group"><a id="load-adapters" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#load-adapters"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Load adapters</span></h2> <p data-svelte-h="svelte-7ekd1">Adapters can be loaded onto a pretrained model with <a href="/docs/peft/pr_3207/en/package_reference/peft_model#peft.PeftModel.load_adapter">load_adapter()</a>, which is useful for trying out different adapters whose weights aren’t merged. 
Set the active adapter weights with the <a href="/docs/peft/pr_3207/en/package_reference/tuners#peft.tuners.tuners_utils.BaseTuner.set_adapter">set_adapter()</a> function.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoModelForCausalLM | |
| <span class="hljs-keyword">from</span> peft <span class="hljs-keyword">import</span> PeftModel | |
| base_model = AutoModelForCausalLM.from_pretrained(<span class="hljs-string">"mistralai/Mistral-7B-v0.1"</span>) | |
| peft_model_id = <span class="hljs-string">"alignment-handbook/zephyr-7b-sft-lora"</span> | |
| model = PeftModel.from_pretrained(base_model, peft_model_id) | |
| <span class="hljs-comment"># load different adapter</span> | |
| model.load_adapter(<span class="hljs-string">"alignment-handbook/zephyr-7b-dpo-lora"</span>, adapter_name=<span class="hljs-string">"dpo"</span>) | |
| <span class="hljs-comment"># set adapter as active</span> | |
| model.set_adapter(<span class="hljs-string">"dpo"</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1tis9af">To return the base model, you could use <a href="/docs/peft/pr_3207/en/package_reference/tuners#peft.tuners.tuners_utils.BaseTuner.unload">unload()</a> to unload all of the LoRA modules or <a href="/docs/peft/pr_3207/en/package_reference/tuners#peft.tuners.tuners_utils.BaseTuner.delete_adapter">delete_adapter()</a> to delete the adapter entirely. <a href="/docs/peft/pr_3207/en/package_reference/tuners#peft.tuners.tuners_utils.BaseTuner.unload">unload()</a> is not an in-place operation, remember to assign the returned model to a variable and use it.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># unload adapter</span> | |
| model = model.unload() | |
| <span class="hljs-comment"># delete adapter</span> | |
| model.delete_adapter(<span class="hljs-string">"dpo"</span>)<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="tensor-parallelism" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#tensor-parallelism"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Tensor Parallelism</span></h2> <p data-svelte-h="svelte-1nopa8x">LoRA supports <a href="https://huggingface.co/docs/transformers/main/en/perf_train_gpu_many#tensor-parallelism" rel="nofollow">Tensor Parallelism (TP)</a> as provided by Transformers. 
When a base model is loaded with a <code>tp_plan</code>, PEFT automatically detects the TP configuration of each target module and adds the appropriate hooks to the LoRA adapter weights so that they participate correctly in the tensor-parallel computation.</p> <blockquote class="warning" data-svelte-h="svelte-1xfrf8m"><p>Tensor Parallelism support for LoRA requires <code>transformers >= 5.4.0</code>.</p></blockquote> <p data-svelte-h="svelte-13q6dws">Usage is identical to the standard LoRA workflow — simply load the base model with a <code>tp_plan</code> before wrapping it with PEFT:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoModelForCausalLM | |
| <span class="hljs-keyword">from</span> peft <span class="hljs-keyword">import</span> get_peft_model, LoraConfig | |
| model = AutoModelForCausalLM.from_pretrained(<span class="hljs-string">"meta-llama/Llama-3.1-8B"</span>, tp_plan=<span class="hljs-string">"auto"</span>) | |
| lora_config = LoraConfig(r=<span class="hljs-number">16</span>, target_modules=[<span class="hljs-string">"q_proj"</span>, <span class="hljs-string">"v_proj"</span>]) | |
| model = get_peft_model(model, lora_config)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-vz807h">Saving and loading work as usual via <code>save_pretrained</code> / <code>from_pretrained</code>. PEFT gathers the sharded adapter weights back to full tensors before saving, so checkpoints are portable and independent of the number of devices used during training.</p> <h2 class="relative group"><a id="inference" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#inference"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Inference</span></h2> <p data-svelte-h="svelte-okuddl">This section showcases what you can do during inference time with LoRA, such as uncoupling the adapter.</p> <h3 class="relative group"><a id="activated-lora-alora" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#activated-lora-alora"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" 
viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Activated LoRA (aLoRA)</span></h3> <p data-svelte-h="svelte-6ttvul">Activated LoRA (aLoRA) is a low rank adapter architecture for causal LMs that reuses the existing base model KV cache for more efficient inference. This approach is best suited for inference pipelines which rely on the base model for most tasks/generations, but use aLoRA adapter(s) to perform specialized task(s) within the chain. For example, checking or correcting generated outputs of the base model. In these settings, inference times can be sped up by an order of magnitude or more. For more information on aLoRA and many example use cases, see the aLoRA <a href="https://huggingface.co/papers/2504.12397" rel="nofollow">paper</a>.</p> <p data-svelte-h="svelte-4gntqu">This technique scans for the last occurrence of an invocation sequence (<code>alora_invocation_tokens</code>) in each input (this can be as short as 1 token). It activates the adapter weights on tokens starting with the beginning of the invocation sequence. Any inputs after the invocation sequence are also adapted, and all generated tokens will use the adapted weights. Weights on prior tokens are left un-adapted, making the cache for those tokens interchangeable with base model cache due to the causal attention mask in causal LMs. Usage is very similar to standard LoRA. 
The key difference is that the invocation sequence must be specified when the adapter is created:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> peft <span class="hljs-keyword">import</span> LoraConfig | |
| config = LoraConfig(alora_invocation_tokens=alora_invocation_tokens, task_type=<span class="hljs-string">"CAUSAL_LM"</span>, ...)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-14bh0mw"><code>alora_invocation_tokens</code> is a list of integer token ids. Given a desired invocation string, this can be obtained as:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->invocation_string = <span class="hljs-string">"placeholder"</span> | |
| alora_invocation_tokens = tokenizer.encode(invocation_string, add_special_tokens=<span class="hljs-literal">False</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1x4zjl4">The tokenizer is the base model’s tokenizer. Use <code>add_special_tokens=False</code> to avoid adding <code>SOS</code>/<code>EOS</code> tokens in our search string (which will most likely cause the search to fail).</p> <p data-svelte-h="svelte-1dbt04g"><strong>Notes</strong></p> <ul data-svelte-h="svelte-1hqtaeg"><li>aLoRA is only supported for <code>task_type=CAUSAL_LM</code> tasks due to its focus on cache reuse.</li> <li>Since the weights are adapted on fewer tokens, often (not always) aLoRA requires higher rank (<code>r</code>) than LoRA. <code>r=32</code> can be a good starting point.</li> <li>aLoRA weights cannot be merged into the base model by definition, since the adapter weights are selectively applied to a subset of tokens. Attempts to merge will throw errors.</li> <li>Beam search is not yet supported.</li> <li>It is generally not recommended to add new tokens to the tokenizer that are not present in the base model. This can complicate the target use case of both the base model and adapter model operating on overlapping context. 
You can workaround this by adding <a href="../package_reference/trainable_tokens">trainable tokens</a> to the base model prior to training the adapter.</li></ul> <h4 class="relative group"><a id="choice-of-invocation-sequence-and-sft-design" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#choice-of-invocation-sequence-and-sft-design"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Choice of invocation sequence and SFT design</span></h4> <p data-svelte-h="svelte-1pb48d2">You must add the <code>alora_invocation_tokens</code> sequence because it is not added automatically. We recommend activating the adapter weights early (at the start of any adapter-specific prompting), but after any long inputs, to maximize model performance without compromising cache reuse. As with any model, | |
| formatting should be consistent between train and test.</p> <p data-svelte-h="svelte-149wxvq">Consider the following example, where the base model has a chat template, | |
| and the goal is to train the adapter to generate a desired output.</p> <ul data-svelte-h="svelte-1rhx7b3"><li>Option 1: If there is no task-specific prompt, i.e. the input is a chat history with the <code>assistant</code> prompt, then the chat template’s <code>assistant</code> prompt (e.g. <code><|start_of_role|>assistant<|end_of_role|></code>) is a natural choice for the invocation string. See the model’s chat template to find the prompt for the model.</li> <li>Option 2: If there is a task-specific prompt for the adapter that describes the task the adapter is learning, and that prompt is put as a <code>user</code> turn immediately prior to the generation, then the chat template’s <code>user</code> prompt (e.g. <code><|start_of_role|>user<|end_of_role|></code>) is a natural choice for the invocation string.</li></ul> <p data-svelte-h="svelte-15948xz">After deciding on an invocation string, get the model tokenizer and obtain <code>alora_invocation_tokens</code> as</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform 
-translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->alora_invocation_tokens = tokenizer.encode(invocation_string, add_special_tokens=<span class="hljs-literal">False</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-n95elu">An example inference setup is at <a href="https://github.com/huggingface/peft/blob/main/examples/alora_finetuning/alora_finetuning.py" rel="nofollow">alora finetuning</a>.</p> <blockquote class="note" data-svelte-h="svelte-39vmv"><p>If using custom strings for the invocation string, make sure that the start and end of the string are special tokens to avoid issues with tokenization at the boundaries.</p></blockquote> <p data-svelte-h="svelte-qgp1p0">To see why, imagine that ‘a’, ‘b’, ‘c’, and ‘ab’ are tokens in your tokenizer (numbers 1, 2, 3, 4 respectively). Suppose that your alora_invocation_tokens = [2, 3]. Now imagine your input string is “abc”. Because “ab” is a token, this will get tokenized as [4,3]. So the alora_invocation_tokens will fail to be found, despite the string “bc” being in it. 
If the start and end of the invocation string are special tokens, however, this failure case will never happen since special tokens are never tokenized into the same token with other characters.</p> <h4 class="relative group"><a id="using-and-reusing-cache-for-generation" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#using-and-reusing-cache-for-generation"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Using (and reusing) cache for generation</span></h4> <p data-svelte-h="svelte-noxq52">The main purpose of aLoRA is to make KV cache interchangeable between the base model and aLoRA adapter models <strong>prior to the invocation sequence</strong> since base and adapted KV values are not compatible. Specifically, keys and values stored during one model generation can be used in subsequent generations to avoid expensive prefill operations for context tokens. When sharing cache between the base model and aLoRA adapters, there are 2 main patterns:</p> <ol data-svelte-h="svelte-1ptx1lp"><li>The base model has generated something, and an aLoRA adapter is then called to do a follow-up generation. 
For example, the base model answers a question, and an aLoRA trained to detect hallucinations checks the base model response.</li> <li>An aLoRA adapter has generated something, and the base model or a different aLoRA adapter is called to do a follow-up generation where there is partial context overlap with the original aLoRA. For example, the user provides a query, and an aLoRA rewrites the query to be more self-contained and improve retrieval in a RAG system. Then, documents are retrieved and loaded into context, aLoRA checks if these documents are relevant to the question, and then the base model generates an answer.</li></ol> <p data-svelte-h="svelte-xfbbvr">To demonstrate the above behaviors when using caching, we’re using <a href="https://huggingface.co/docs/transformers/en/kv_cache" rel="nofollow">DynamicCache</a> from <code>transformers</code>. Take care to ensure that adapted cache values are not mixed with base cache values. In particular, an extra step is required for sharing the cache when there is partial context overlap (pattern 2).</p> <p data-svelte-h="svelte-1qt6a9e"><strong>Pattern 1: Base model followed by aLoRA</strong> Here, the entire input and generation from the base model is input into the aLoRA adapter, along with the invocation sequence:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" 
width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> transformers import DynamicCache | |
| <span class="hljs-built_in">..</span>. | |
| cache = DynamicCache() | |
| inputs_base = tokenizer(prompt_base, <span class="hljs-attribute">return_tensors</span>=<span class="hljs-string">"pt"</span>) | |
| <span class="hljs-comment"># Generate from base model and save cache</span> | |
| with model_alora.disable_adapter(): | |
| output = model_alora.generate(inputs_base[<span class="hljs-string">"input_ids"</span>].<span class="hljs-keyword">to</span>(device),<span class="hljs-attribute">attention_mask</span>=inputs_base[<span class="hljs-string">"attention_mask"</span>].<span class="hljs-keyword">to</span>(device),past_key_values = cache,<span class="hljs-attribute">return_dict_in_generate</span>=<span class="hljs-literal">True</span>) | |
| output_text_base = tokenizer.decode(output.sequences[0]) | |
| cache = output.past_key_values | |
| <span class="hljs-comment"># Generate with aLoRA adapter from cache</span> | |
| prompt_alora = output_text_base + INVOCATION_STRING | |
| inputs_alora = tokenizer(prompt_alora, <span class="hljs-attribute">return_tensors</span>=<span class="hljs-string">"pt"</span>).to(device) | |
| output = model_alora.generate(**inputs_alora, <span class="hljs-attribute">past_key_values</span>=cache) | |
| output_text_alora = tokenizer.decode(output[0]) | |
| <span class="hljs-comment"># <span class="hljs-doctag">Note:</span> cache is now tainted with adapter values and cannot be used in base model from here on!</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-e9wr3d"><strong>Pattern 2: aLoRA generation followed by base model (or another aLoRA) with partial context overlap</strong> Here, we prefill the shared context using the base model, and then generate.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> transformers import DynamicCache | |
| import copy | |
| <span class="hljs-built_in">..</span>. | |
| cache = DynamicCache() | |
| inputs_shared = tokenizer(prompt_shared, <span class="hljs-attribute">return_tensors</span>=<span class="hljs-string">"pt"</span>).to(device) | |
| <span class="hljs-comment"># Prefill from base model and save cache</span> | |
| with model_alora.disable_adapter(): | |
| with torch.no_grad(): | |
| model_alora(**inputs_shared, <span class="hljs-attribute">past_key_values</span>=cache) | |
| cache_copy = copy.deepcopy(cache) | |
| <span class="hljs-comment"># Generate from aLoRA using prefilled cache</span> | |
| prompt_alora = prompt_shared + INVOCATION_STRING | |
| inputs_alora = tokenizer(prompt_alora, <span class="hljs-attribute">return_tensors</span>=<span class="hljs-string">"pt"</span>).to(device) | |
| output = model_alora.generate(**inputs_alora, <span class="hljs-attribute">past_key_values</span>=cache) | |
| output_text_alora = tokenizer.decode(output[0]) | |
| <span class="hljs-comment"># Generate from base model using saved cache not tainted by aLoRA KV values</span> | |
| prompt_base = prompt_shared | |
| inputs_base = tokenizer(prompt_base, <span class="hljs-attribute">return_tensors</span>=<span class="hljs-string">"pt"</span>).to(device) | |
| with model_alora.disable_adapter(): | |
| output = model_alora.generate(**inputs_base, <span class="hljs-attribute">past_key_values</span>=cache_copy) | |
| output_text_base = tokenizer.decode(output[0])<!-- HTML_TAG_END --></pre></div> <h3 class="relative group"><a id="inference-with-different-lora-adapters-in-the-same-batch" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#inference-with-different-lora-adapters-in-the-same-batch"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Inference with different LoRA adapters in the same batch</span></h3> <p data-svelte-h="svelte-1fq5tpu">Normally, each inference batch has to use the same adapter(s) in PEFT. This can sometimes be annoying, because we may have batches that contain samples intended to be used with different LoRA adapters. For example, we could have a base model that works well in English and two more LoRA adapters, one for French and one for German. Usually, we would have to split our batches such that each batch only contains samples of one of the languages, we cannot combine different languages in the same batch.</p> <p data-svelte-h="svelte-lcywla">Thankfully, it is possible to mix different LoRA adapters in the same batch using the <code>adapter_name</code> argument. 
Below, we show an example of how this works in practice. First, let’s load the base model, English, and the two adapters, French and German, like this:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoTokenizer, AutoModelForCausalLM | |
| <span class="hljs-keyword">from</span> peft <span class="hljs-keyword">import</span> PeftModel | |
| model_id = ... | |
| tokenizer = AutoTokenizer.from_pretrained(model_id) | |
| model = AutoModelForCausalLM.from_pretrained(model_id) | |
| <span class="hljs-comment"># load the LoRA adapter for French</span> | |
| peft_model = PeftModel.from_pretrained(model, <path>, adapter_name=<span class="hljs-string">"adapter_fr"</span>) | |
| <span class="hljs-comment"># next, load the LoRA adapter for German</span> | |
| peft_model.load_adapter(<path>, adapter_name=<span class="hljs-string">"adapter_de"</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-6wkyom">Now, we want to generate text on a sample that contains all three languages: The first three samples are in English, the next three are in French, and the last three are in German. We can use the <code>adapter_names</code> argument to specify which adapter to use for each sample. Since our base model is used for English, we use the special string <code>"__base__"</code> for these samples. For the next three samples, we indicate the adapter name of the French LoRA fine-tune, in this case <code>"adapter_fr"</code>. For the last three samples, we indicate the adapter name of the German LoRA fine-tune, in this case <code>"adapter_de"</code>. This way, we can use the base model and the two adapters in a single batch.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; 
border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->inputs = tokenizer( | |
| [ | |
| <span class="hljs-string">"Hello, my dog is cute"</span>, | |
| <span class="hljs-string">"Hello, my cat is awesome"</span>, | |
| <span class="hljs-string">"Hello, my fish is great"</span>, | |
| <span class="hljs-string">"Salut, mon chien est mignon"</span>, | |
| <span class="hljs-string">"Salut, mon chat est génial"</span>, | |
| <span class="hljs-string">"Salut, mon poisson est super"</span>, | |
| <span class="hljs-string">"Hallo, mein Hund ist süß"</span>, | |
| <span class="hljs-string">"Hallo, meine Katze ist toll"</span>, | |
| <span class="hljs-string">"Hallo, mein Fisch ist großartig"</span>, | |
| ], | |
| return_tensors=<span class="hljs-string">"pt"</span>, | |
| padding=<span class="hljs-literal">True</span>, | |
| ) | |
| adapter_names = [ | |
| <span class="hljs-string">"__base__"</span>, <span class="hljs-string">"__base__"</span>, <span class="hljs-string">"__base__"</span>, | |
| <span class="hljs-string">"adapter_fr"</span>, <span class="hljs-string">"adapter_fr"</span>, <span class="hljs-string">"adapter_fr"</span>, | |
| <span class="hljs-string">"adapter_de"</span>, <span class="hljs-string">"adapter_de"</span>, <span class="hljs-string">"adapter_de"</span>, | |
| ] | |
| output = peft_model.generate(**inputs, adapter_names=adapter_names, max_new_tokens=<span class="hljs-number">20</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1o40ch6">Note that the order does not matter here, i.e. the samples in the batch don’t need to be grouped by adapter as in the example above. We just need to ensure that the <code>adapter_names</code> argument is aligned correctly with the samples.</p> <p data-svelte-h="svelte-mqf2jo">Additionally, the same approach also works with the <code>modules_to_save</code> feature, which allows for saving and reusing specific neural network layers, such as custom heads for classification tasks, across different LoRA adapters.</p> <h4 class="relative group"><a id="caveats" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#caveats"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Caveats</span></h4> <p data-svelte-h="svelte-e0xhf6">Using this feature has some drawbacks, namely:</p> <ul data-svelte-h="svelte-9zbcsh"><li>It only works for inference, not for training.</li> <li>Disabling adapters using the <code>with model.disable_adapter()</code> context takes 
precedence over <code>adapter_names</code>.</li> <li>You cannot pass <code>adapter_names</code> when some adapter weights were merged with base weight using the <code>merge_adapter</code> method. Please unmerge all adapters first by calling <code>model.unmerge_adapter()</code>.</li> <li>For obvious reasons, this cannot be used after calling <code>merge_and_unload()</code>, since all the LoRA adapters will be merged into the base weights in this case.</li> <li>This feature does not currently work with DoRA, so set <code>use_dora=False</code> in your <code>LoraConfig</code> if you want to use it.</li> <li>The <code>modules_to_save</code> feature is currently only supported for the layers of types <code>Linear</code>, <code>Embedding</code>, <code>Conv2d</code> and <code>Conv1d</code>.</li> <li>There is an expected overhead for inference with <code>adapter_names</code>, especially if the amount of different adapters in the batch is high. This is because the batch size is effectively reduced to the number of samples per adapter. If runtime performance is your top priority, try the following: | |
| <ul><li>Increase the batch size.</li> <li>Try to avoid having a large number of different adapters in the same batch, prefer homogeneous batches. This can be achieved by buffering samples with the same adapter and only perform inference with a small handful of different adapters.</li> <li>Take a look at alternative implementations such as <a href="https://github.com/predibase/lorax" rel="nofollow">LoRAX</a>, <a href="https://github.com/punica-ai/punica" rel="nofollow">punica</a>, or <a href="https://github.com/S-LoRA/S-LoRA" rel="nofollow">S-LoRA</a>, which are specialized to work with a large number of different adapters.</li></ul></li></ul> <h3 class="relative group"><a id="composing-and-reusing-lora-adapters" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#composing-and-reusing-lora-adapters"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Composing and Reusing LoRA Adapters</span></h3> <h4 class="relative group"><a id="arrow" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#arrow"><span><svg 
class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Arrow</span></h4> <p data-svelte-h="svelte-1qlwkok"><a href="https://huggingface.co/papers/2405.11157" rel="nofollow">Arrow</a> is a modular routing algorithm designed to combine multiple pre-trained task-specific LoRA adapters to solve a given task. Rather than merging all adapters naively, Arrow introduces a <strong>gradient-free, token-wise mixture-of-experts (MoE) routing mechanism</strong>. At inference time, it first computes a <em>prototype</em> for each LoRA by extracting the top right singular vector from its SVD decomposition. Each token representation is then compared to these prototypes via cosine similarity to obtain routing coefficients. Tokens are assigned to the top-k most relevant LoRA adapters, with the coefficients normalized through softmax, and their outputs linearly combined. This allows effective reuse of existing LoRA modules for new tasks and leads to stronger zero-shot generalization.</p> <p data-svelte-h="svelte-c3e0m7">In PEFT, Arrow is enabled through [`ArrowConfig]<code>and</code>create_arrow_model<code>. 
You can also configure parameters such as </code><code>top_k</code> (the number of LoRA adapters combined per token), <code>router_temperature</code> (the softmax temperature applied to the routing coefficients), and <code>rng_seed</code> (for reproducibility).</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> peft <span class="hljs-keyword">import</span> create_arrow_model, ArrowConfig | |
| <span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoModelForCausalLM | |
| <span class="hljs-comment"># Loading the model</span> | |
| base_model = AutoModelForCausalLM.from_pretrained(<span class="hljs-string">"microsoft/Phi-3-mini-4k-instruct"</span>) | |
| <span class="hljs-comment"># Creating the Arrow config</span> | |
| arrow_config = ArrowConfig( | |
| top_k=<span class="hljs-number">3</span>, | |
| router_temperature=<span class="hljs-number">1.0</span>, | |
| rng_seed=<span class="hljs-number">42</span>, | |
| ) | |
| <span class="hljs-comment"># The LoRA adapters below were trained on a clustered FLAN dataset.</span> | |
| <span class="hljs-comment"># Task clustering was performed using the Model-Based Clustering (MBC) method,</span> | |
| <span class="hljs-comment"># as described in the Arrow paper.</span> | |
| <span class="hljs-comment"># While one could train a separate LoRA for each task and let Arrow route tokens among them,</span> | |
| <span class="hljs-comment"># training LoRAs on clusters of tasks instead provides an indirect optimization for</span> | |
| <span class="hljs-comment"># transfer across the multi-task dataset.</span> | |
| task_specific_adapter_paths = [ | |
| <span class="hljs-string">f"TahaBa/phi3-mini-clustered-flan/ts_expert_<span class="hljs-subst">{i}</span>"</span> <span class="hljs-keyword">for</span> i <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(<span class="hljs-number">10</span>) | |
| ] | |
| <span class="hljs-comment"># Creating the Arrow model</span> | |
| model = create_arrow_model( | |
| base_model=base_model, | |
| task_specific_adapter_paths=task_specific_adapter_paths, | |
| arrow_config=arrow_config, | |
| ) | |
| <span class="hljs-comment"># Now the forward path could be called on this model, like a normal PeftModel.</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-13ney51">Furthermore, you can add or remove adapters after calling <code>create_arrow_model</code>—for example, to fine-tune a new adapter or discard an unnecessary one. Once the adapters are in place, you can activate the <code>"arrow_router"</code> for inference to use Arrow. Note that if you add a new LoRA adapter after <code>create_arrow_model</code> and want to fine-tune it, you must explicitly set the new adapter as active, since <code>"arrow_router"</code> is activated by default in <code>create_arrow_model</code>.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> trl <span class="hljs-keyword">import</span> 
SFTTrainer, SFTConfig | |
| <span class="hljs-comment"># Adding a new adapter and activating it</span> | |
| model.add_adapter(adapter_name=<span class="hljs-string">'new_adapter'</span>) | |
| model.set_adapter(<span class="hljs-string">'new_adapter'</span>) | |
| <span class="hljs-comment"># Now the model could be trained along the `new_adapter`.</span> | |
| trainer = SFTTrainer( | |
| model=model, | |
| args=SFTConfig(...), | |
| ... | |
| ) | |
| <span class="hljs-comment"># Once the training is done, you can activate `arrow_router` and use it in inference</span> | |
| model.set_adapter(<span class="hljs-string">'arrow_router'</span>) <span class="hljs-comment"># Model is ready to be used at inference time now</span><!-- HTML_TAG_END --></pre></div> <h4 class="relative group"><a id="genknowsub" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#genknowsub"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>GenKnowSub</span></h4> <p data-svelte-h="svelte-1damsga"><a href="https://aclanthology.org/2025.acl-short.54/" rel="nofollow">GenKnowSub</a> augments Arrow by purifying task-specific LoRA adapters before routing. The key idea is to subtract general knowledge encoded in LoRA space—based on the <a href="https://huggingface.co/papers/2212.04089" rel="nofollow">forgetting-via-negation principle</a>—so that task adapters become more isolated and focused on task-relevant signals. Concretely, GenKnowSub estimates a low-dimensional “general” subspace from a set of general (non task-specific) LoRA adapters and removes this component from each task adapter’s LoRA update prior to Arrow’s token-wise routing. 
This typically improves compositionality and reduces interference when combining many task adapters.</p> <p data-svelte-h="svelte-bisu0r">In PEFT, enable GenKnowSub by setting <code>use_gks=True</code> in ArrowConfig, and providing <code>general_adapter_paths</code> in <code>create_arrow_model</code>:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> peft <span class="hljs-keyword">import</span> create_arrow_model, ArrowConfig | |
| <span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoModelForCausalLM | |
| <span class="hljs-comment"># Loading the model</span> | |
| base_model = AutoModelForCausalLM.from_pretrained(<span class="hljs-string">"microsoft/Phi-3-mini-4k-instruct"</span>) | |
| <span class="hljs-comment"># Creating the Arrow config</span> | |
| arrow_config = ArrowConfig( | |
| top_k=<span class="hljs-number">3</span>, | |
| router_temperature=<span class="hljs-number">1.0</span>, | |
| use_gks=<span class="hljs-literal">True</span>, | |
| rng_seed=<span class="hljs-number">42</span>, | |
| ) | |
| <span class="hljs-comment"># Path to task-specific, trained on flan clustered dataset (as we explained before.)</span> | |
| task_specific_adapter_paths = [ | |
| <span class="hljs-string">f"TahaBa/phi3-mini-clustered-flan/ts_expert_<span class="hljs-subst">{i}</span>"</span> <span class="hljs-keyword">for</span> i <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(<span class="hljs-number">10</span>) | |
| ] | |
| <span class="hljs-comment"># These general adapters are trained on English, German, and French Wikipedia dataset,</span> | |
| <span class="hljs-comment"># with causal language modelling objective, each pair like: (507-token sentence, 5-token completion), and the loss computed on the completion</span> | |
| general_adapter_paths = [ | |
| <span class="hljs-string">"TahaBa/phi3-mini-general-adapters/cluster0_batch16_prop1.0_langen/checkpoint-17"</span>, | |
| <span class="hljs-string">"TahaBa/phi3-mini-general-adapters/cluster0_batch16_prop1.0_langfr/checkpoint-35"</span>, | |
| <span class="hljs-string">"TahaBa/phi3-mini-general-adapters/cluster0_batch16_prop1.0_langger/checkpoint-17"</span> | |
| ] | |
| <span class="hljs-comment"># Creating the Arrow model</span> | |
| model = create_arrow_model( | |
| base_model=base_model, | |
| task_specific_adapter_paths=task_specific_adapter_paths, | |
| general_adapter_paths=general_adapter_paths, | |
| arrow_config=arrow_config, | |
| ) | |
| <span class="hljs-comment"># Now the forward path could be called on this model, like a normal PeftModel.</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-19fqfkj">To encode general knowledge, GenKnowSub subtracts the average of the provided general adapters from each task-specific adapter once, before routing begins. Furthermore, the ability to add or remove adapters after calling <code>create_arrow_model</code> (as described in the Arrow section) is still supported in this case.</p> <blockquote class="tip"><p data-svelte-h="svelte-14xydvq"><strong>Things to keep in mind when using Arrow + GenKnowSub:</strong></p> <ul><li data-svelte-h="svelte-1g552yx"><p>All LoRA adapters (task-specific and general) must share the same <code>rank</code> and <code>target_modules</code>.</p></li> <li data-svelte-h="svelte-1kqmy9e"><p>Any inconsistency in these settings will raise an error in <code>create_arrow_model</code>.</p></li> <li data-svelte-h="svelte-o6l66k"><p>Having different scaling factors (<code>lora_alpha</code>) across task adapters is supported — Arrow handles them automatically.</p></li> <li data-svelte-h="svelte-3a8u10"><p>Merging the <code>"arrow_router"</code> is not supported, due to its dynamic routing behavior.</p></li> <li data-svelte-h="svelte-1f7byj"><p>In create_arrow_model, task adapters are loaded as <code>task_i</code> and general adapters as <code>gks_j</code> (where <code>i</code> and <code>j</code> are indices). The function ensures consistency of <code>target_modules</code>, <code>rank</code>, and whether adapters are applied to <code>Linear</code> or <code>Linear4bit</code> layers. It then adds the <code>"arrow_router"</code> module and activates it. 
Any customization of this process requires overriding <code>create_arrow_model</code>.</p></li> <li><p data-svelte-h="svelte-1j9rbu8">This implementation is compatible with 4-bit quantization (via bitsandbytes):</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoModelForCausalLM, BitsAndBytesConfig | |
| <span class="hljs-keyword">import</span> torch | |
| <span class="hljs-comment"># Quantisation config</span> | |
| bnb_config = BitsAndBytesConfig( | |
| load_in_4bit=<span class="hljs-literal">True</span>, | |
| bnb_4bit_quant_type=<span class="hljs-string">"nf4"</span>, | |
| bnb_4bit_compute_dtype=torch.bfloat16, | |
| bnb_4bit_use_double_quant=<span class="hljs-literal">False</span>, | |
| ) | |
| <span class="hljs-comment"># Loading the model</span> | |
| base_model = AutoModelForCausalLM.from_pretrained( | |
| <span class="hljs-string">"microsoft/Phi-3-mini-4k-instruct"</span>, | |
| dtype=torch.bfloat16, | |
| device_map=<span class="hljs-string">"auto"</span>, | |
| quantization_config=bnb_config, | |
| ) | |
| <span class="hljs-comment"># Now call create_arrow_model() as we explained before.</span><!-- HTML_TAG_END --></pre></div></li></ul></blockquote> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/peft/blob/main/docs/source/developer_guides/lora.md" target="_blank"><svg class="mr-1" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M31,16l-7,7l-1.41-1.41L28.17,16l-5.58-5.59L24,9l7,7z"></path><path d="M1,16l7-7l1.41,1.41L3.83,16l5.58,5.59L8,23l-7-7z"></path><path d="M12.419,25.484L17.639,6.552l1.932,0.518L14.351,26.002z"></path></svg> <span data-svelte-h="svelte-zjs2n5"><span class="underline">Update</span> on GitHub</span></a> <p></p> | |
| <script> | |
| // SvelteKit client-side bootstrap emitted by the documentation build. | |
| // NOTE(review): build-generated (hash-named global and module URLs) — do not edit by hand. | |
| { | |
| // Global runtime config read by the SvelteKit runtime; paths are versioned per docs/PR build. | |
| __sveltekit_1wz1iqk = { | |
| assets: "/docs/peft/pr_3207/en", | |
| base: "/docs/peft/pr_3207/en", | |
| env: {} | |
| }; | |
| // Hydration mount point: the element containing this inline script. | |
| const element = document.currentScript.parentElement; | |
| // Serialized route data for the matched layout/page nodes (none for this static page). | |
| const data = [null,null]; | |
| // Load the framework runtime entry and the app bundle in parallel, then hydrate the page. | |
| Promise.all([ | |
| import("/docs/peft/pr_3207/en/_app/immutable/entry/start.dbacf4b8.js"), | |
| import("/docs/peft/pr_3207/en/_app/immutable/entry/app.cb03e0d4.js") | |
| ]).then(([kit, app]) => { | |
| // node_ids: indices of this route's layout/page nodes in the generated app manifest. | |
| kit.start(app, element, { | |
| node_ids: [0, 11], | |
| data, | |
| form: null, | |
| error: null | |
| }); | |
| }); | |
| // NOTE(review): no .catch() on the Promise chain — a failed module load is silent; | |
| // presumably acceptable for generated bootstrap, but confirm against the build tooling. | |
| } | |
| </script> | |
Xet Storage Details
- Size:
- 200 kB
- Xet hash:
- 8556852f5404210456107d84e4b1fb29f2f08cbf6ea72dcdee1cc5e2f261a5f4
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.