Buckets:

hf-doc-build/doc-dev / transformers /pr_36049 /en /modular_transformers.html
rtrm's picture
download
raw
124 kB
<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Modular transformers&quot;,&quot;local&quot;:&quot;modular-transformers&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;What is it?&quot;,&quot;local&quot;:&quot;what-is-it&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Details&quot;,&quot;local&quot;:&quot;details&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Enforcement&quot;,&quot;local&quot;:&quot;enforcement&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Examples&quot;,&quot;local&quot;:&quot;examples&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;What it is not&quot;,&quot;local&quot;:&quot;what-it-is-not&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Real world example breakdown&quot;,&quot;local&quot;:&quot;real-world-example-breakdown&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Config class&quot;,&quot;local&quot;:&quot;config-class&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Norm class&quot;,&quot;local&quot;:&quot;norm-class&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Attention class&quot;,&quot;local&quot;:&quot;attention-class&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;The DecoderLayer class&quot;,&quot;local&quot;:&quot;the-decoderlayer-class&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;The Model class&quot;,&quot;local&quot;:&quot;the-model-class&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Finally… The ForCausalLM class&quot;,&quot;local&quot;:&quot;finally-the-forcausallm-class&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Advanced usage&quot;,&quot;local&quot;:&quot;advanced-usage&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Removing 
attributes which are not just assignments&quot;,&quot;local&quot;:&quot;removing-attributes-which-are-not-just-assignments&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Avoiding super() special meaning&quot;,&quot;local&quot;:&quot;avoiding-super-special-meaning&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Deleting unused methods&quot;,&quot;local&quot;:&quot;deleting-unused-methods&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Define new functions&quot;,&quot;local&quot;:&quot;define-new-functions&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Decorators&quot;,&quot;local&quot;:&quot;decorators&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;The super_kwargs special case&quot;,&quot;local&quot;:&quot;the-superkwargs-special-case&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;The DOCSTRING variables&quot;,&quot;local&quot;:&quot;the-docstring-variables&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Limitations&quot;,&quot;local&quot;:&quot;limitations&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Special naming (essentially for multimodal models)&quot;,&quot;local&quot;:&quot;special-naming-essentially-for-multimodal-models&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Automatic docstrings issue (mostly for Configs)&quot;,&quot;local&quot;:&quot;automatic-docstrings-issue-mostly-for-configs&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2}],&quot;depth&quot;:1}">
<link href="/docs/transformers/pr_36049/en/_app/immutable/assets/0.e3b0c442.css" rel="stylesheet">
<link rel="modulepreload" href="/docs/transformers/pr_36049/en/_app/immutable/entry/start.86af8b85.js">
<link rel="modulepreload" href="/docs/transformers/pr_36049/en/_app/immutable/chunks/scheduler.25b97de1.js">
<link rel="modulepreload" href="/docs/transformers/pr_36049/en/_app/immutable/chunks/singletons.20f80512.js">
<link rel="modulepreload" href="/docs/transformers/pr_36049/en/_app/immutable/chunks/index.e188933d.js">
<link rel="modulepreload" href="/docs/transformers/pr_36049/en/_app/immutable/chunks/paths.162096ab.js">
<link rel="modulepreload" href="/docs/transformers/pr_36049/en/_app/immutable/entry/app.d602e208.js">
<link rel="modulepreload" href="/docs/transformers/pr_36049/en/_app/immutable/chunks/index.d9030fc9.js">
<link rel="modulepreload" href="/docs/transformers/pr_36049/en/_app/immutable/nodes/0.8e0a4db0.js">
<link rel="modulepreload" href="/docs/transformers/pr_36049/en/_app/immutable/chunks/each.e59479a4.js">
<link rel="modulepreload" href="/docs/transformers/pr_36049/en/_app/immutable/nodes/393.10819250.js">
<link rel="modulepreload" href="/docs/transformers/pr_36049/en/_app/immutable/chunks/CodeBlock.e6cd0d95.js">
<link rel="modulepreload" href="/docs/transformers/pr_36049/en/_app/immutable/chunks/EditOnGithub.91d95064.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Modular transformers&quot;,&quot;local&quot;:&quot;modular-transformers&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;What is it?&quot;,&quot;local&quot;:&quot;what-is-it&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Details&quot;,&quot;local&quot;:&quot;details&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Enforcement&quot;,&quot;local&quot;:&quot;enforcement&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Examples&quot;,&quot;local&quot;:&quot;examples&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;What it is not&quot;,&quot;local&quot;:&quot;what-it-is-not&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Real world example breakdown&quot;,&quot;local&quot;:&quot;real-world-example-breakdown&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Config class&quot;,&quot;local&quot;:&quot;config-class&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Norm class&quot;,&quot;local&quot;:&quot;norm-class&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Attention class&quot;,&quot;local&quot;:&quot;attention-class&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;The DecoderLayer class&quot;,&quot;local&quot;:&quot;the-decoderlayer-class&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;The Model class&quot;,&quot;local&quot;:&quot;the-model-class&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Finally… The ForCausalLM class&quot;,&quot;local&quot;:&quot;finally-the-forcausallm-class&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Advanced 
usage&quot;,&quot;local&quot;:&quot;advanced-usage&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Removing attributes which are not just assignments&quot;,&quot;local&quot;:&quot;removing-attributes-which-are-not-just-assignments&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Avoiding super() special meaning&quot;,&quot;local&quot;:&quot;avoiding-super-special-meaning&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Deleting unused methods&quot;,&quot;local&quot;:&quot;deleting-unused-methods&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Define new functions&quot;,&quot;local&quot;:&quot;define-new-functions&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Decorators&quot;,&quot;local&quot;:&quot;decorators&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;The super_kwargs special case&quot;,&quot;local&quot;:&quot;the-superkwargs-special-case&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;The DOCSTRING variables&quot;,&quot;local&quot;:&quot;the-docstring-variables&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Limitations&quot;,&quot;local&quot;:&quot;limitations&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Special naming (essentially for multimodal models)&quot;,&quot;local&quot;:&quot;special-naming-essentially-for-multimodal-models&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Automatic docstrings issue (mostly for Configs)&quot;,&quot;local&quot;:&quot;automatic-docstrings-issue-mostly-for-configs&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2}],&quot;depth&quot;:1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <h1 class="relative group"><a id="modular-transformers" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 
with-hover:group-hover:opacity-100 with-hover:right-full" href="#modular-transformers"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Modular transformers</span></h1> <p data-svelte-h="svelte-1qsl2az"><code>transformers</code> is an opinionated framework; our philosophy is defined in the following <a href="./philosophy">conceptual guide</a>.</p> <p data-svelte-h="svelte-1qcwnc7">The core of that philosophy is exemplified by the <a href="https://huggingface.co/blog/transformers-design-philosophy" rel="nofollow">single model, single file</a>
aspect of the library. This component’s downside is that it limits the inheritance and importability of components from
files to others in the toolkit.</p> <p data-svelte-h="svelte-hcbv0p">As a result, model components tend to be repeated across many files. There are as many attention layers defined
in <code>transformers</code> as there are models, and a significant number of those are identical to each other.
The unfortunate consequence is that independent implementations tend to diverge as fixes and changes get applied
to specific parts of the code.</p> <p data-svelte-h="svelte-14lpvvv">In order to balance this issue, we introduced the concept of “copies” across the library. By adding a comment indicating
that code is a copy of another, we can enforce through CI and local commands that copies do not diverge. However,
while the complexity is low, this is often quite tedious to do.</p> <p data-svelte-h="svelte-1tehds2">And, finally, this contributes to adding a significant overhead to contributing models which we would like to remove.
This approach often requires model contributions to add modeling code (~1k lines), processor (~500 lines), tests, docs,
etc. Model contribution PRs rarely add less than 3-5k lines of code, with much of this code being boilerplate.</p> <p data-svelte-h="svelte-5u9x31">This raises the bar for contributions, and with Modular Transformers, we’re aiming to lower the bar to a much more
acceptable point.</p> <p data-svelte-h="svelte-c0230n">If you plan to add a model to <code>transformers</code> make sure you read <a href="https://huggingface.co/docs/transformers/add_new_model" rel="nofollow">How to add a model to 🤗 Transformers?</a>.
For any kind of contributions, see <a href="https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md" rel="nofollow">CONTRIBUTING.md</a>.</p> <h2 class="relative group"><a id="what-is-it" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#what-is-it"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>What is it?</span></h2> <p data-svelte-h="svelte-1qqnnub">Modular Transformers introduces the concept of a “modular” file to a model folder. This modular file accepts code
that isn’t typically accepted in modeling/processing files, as it allows importing from neighbouring models as well
as inheritance from classes to others.</p> <p data-svelte-h="svelte-huwsbv">This modular file defines models, processors, and the configuration class that would otherwise be defined in their
respective modules.</p> <p data-svelte-h="svelte-dk7wye">Finally, this feature introduces a new <code>linter</code> which will “unravel” the modular file into the “single model, single
file” directory structure. These files will get auto-generated every time the script is run; reducing the required
contributions to the modular file, and therefore only to the changes between the contributed model and others.</p> <p data-svelte-h="svelte-12pm85e">Model users will end up importing and using the single-file interface, so no change is expected here. Doing this, we
hope to combine the best of both worlds: enabling simple contributions while sticking to our philosophy.</p> <p data-svelte-h="svelte-ah1dr9">This is therefore a replacement for the <code># Copied from</code> markers, and previously contributed models can be expected to
be moved to the new Modular Transformers format in the coming months.</p> <h3 class="relative group"><a id="details" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#details"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Details</span></h3> <p data-svelte-h="svelte-11y95xa">To generate a single file from the modular file, run the following command.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute 
pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->python utils/modular_model_converter.py --files-to-parse src/transformers/models/&lt;your_model&gt;/modular_&lt;your_model&gt;.py<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-xkvkhs">The “linter”, which unravels the inheritance and creates all single-files from the modular file, will flatten the
inheritance while trying to be invisible to Python users. At this time, the linter flattens a <strong>single</strong> level of
inheritance.</p> <p data-svelte-h="svelte-1gkqha7">For example:</p> <ul data-svelte-h="svelte-1np4rww"><li>If a configuration class inherits from another and adds/deletes an argument, the generated file will either directly
reference it (in case of addition) or completely remove it (in case of deletion).</li> <li>If a class inherits from another, for example: <code>class GemmaModel(LlamaModel):</code>, dependencies are automatically
inferred. All submodules will be automatically added from the superclass.</li> <li>If you define new functions in the <code>modular</code> and use them inside classes, the linter will automatically infer their dependencies and copy them into the generated files.</li></ul> <p data-svelte-h="svelte-12b6859">You should be able to write everything (the tokenizer, the image processor, the model, the config) in this <code>modular</code>
file, and the corresponding files will be created for you.</p> <h3 class="relative group"><a id="enforcement" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#enforcement"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Enforcement</span></h3> <p data-svelte-h="svelte-1jju20g">Run the command below to ensure the generated content matches <code>modular_&lt;your_model&gt;.py</code></p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div 
class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->python utils/check_modular_conversion.py --files src/transformers/models/&lt;your_model&gt;/modular_&lt;your_model&gt;.py<!-- HTML_TAG_END --></pre></div> <h3 class="relative group"><a id="examples" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#examples"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Examples</span></h3> <p data-svelte-h="svelte-1o8u693">Here is a quick example with BERT and RoBERTa. The two models are intimately related: their modeling implementation
differs solely by a change in the embedding layer.</p> <p data-svelte-h="svelte-si4m91">Instead of redefining the model entirely, here is what the <code>modular_roberta.py</code> file looks like for the modeling &amp;
configuration classes (for the sake of the example, the tokenizer is ignored at this time as very different).</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> torch <span class="hljs-keyword">import</span> nn
<span class="hljs-keyword">from</span> ..bert.configuration_bert <span class="hljs-keyword">import</span> BertConfig
<span class="hljs-keyword">from</span> ..bert.modeling_bert <span class="hljs-keyword">import</span> (
BertModel,
BertEmbeddings,
BertForMaskedLM
)
<span class="hljs-comment"># The RoBERTa config is identical to BERT&#x27;s config</span>
<span class="hljs-keyword">class</span> <span class="hljs-title class_">RobertaConfig</span>(<span class="hljs-title class_ inherited__">BertConfig</span>):
model_type = <span class="hljs-string">&#x27;roberta&#x27;</span>
<span class="hljs-comment"># We redefine the embeddings here to highlight the padding ID difference, and we redefine the position embeddings</span>
<span class="hljs-keyword">class</span> <span class="hljs-title class_">RobertaEmbeddings</span>(<span class="hljs-title class_ inherited__">BertEmbeddings</span>):
<span class="hljs-keyword">def</span> <span class="hljs-title function_">__init__</span>(<span class="hljs-params">self, config</span>):
<span class="hljs-built_in">super</span>().__init__(config)
self.padding_idx = config.pad_token_id
self.position_embeddings = nn.Embedding(
config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
)
<span class="hljs-comment"># The RoBERTa model is identical to the BERT model, except for the embedding layer. </span>
<span class="hljs-comment"># We redefine the embeddings above, so here there is no need to do additional work</span>
<span class="hljs-keyword">class</span> <span class="hljs-title class_">RobertaModel</span>(<span class="hljs-title class_ inherited__">BertModel</span>):
<span class="hljs-keyword">def</span> <span class="hljs-title function_">__init__</span>(<span class="hljs-params">self, config</span>):
<span class="hljs-built_in">super</span>().__init__(config)
self.embeddings = RobertaEmbeddings(config)
<span class="hljs-comment"># The heads now only need to redefine the model inside to the correct `RobertaModel`</span>
<span class="hljs-keyword">class</span> <span class="hljs-title class_">RobertaForMaskedLM</span>(<span class="hljs-title class_ inherited__">BertForMaskedLM</span>):
<span class="hljs-keyword">def</span> <span class="hljs-title function_">__init__</span>(<span class="hljs-params">self, config</span>):
<span class="hljs-built_in">super</span>().__init__(config)
self.model = RobertaModel(config)<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="what-it-is-not" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#what-it-is-not"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>What it is not</span></h2> <p data-svelte-h="svelte-bvl6dd">It is not a replacement for the modeling code (yet?), and if your model is not based on anything else that ever existed, then you can add a <code>modeling</code> file as usual. 
Similarly, if you cannot easily inherit your <code>configuration</code> (or <code>tokenization</code> or <code>processing</code>) file from another model’s similar file, you can add that filetype directly (even though defining it in the modular file would work, it would clutter it).</p> <h2 class="relative group"><a id="real-world-example-breakdown" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#real-world-example-breakdown"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Real world example breakdown</span></h2> <p data-svelte-h="svelte-6v2s2r">As explained, modular allows you to use regular Python inheritance from any other model’s code in the library, in order to define your own. For this reason, it will work better/be easier if you first browse the library a bit to find models close to yours, in order to inherit from them. For example, are you using a sliding window in the <code>Attention</code> class? Then start by checking models that are well known to use it, e.g. <code>Mistral</code>, or <code>Qwen2</code>! Are you using interleaved <code>RotaryEmbedding</code> modules? 
Check out <code>Cohere</code>, <code>Cohere2</code> and <code>Glm</code> models! Otherwise a very strong starting point is to check out <code>Llama</code>. And if you are doing a bit of all of that at once, then you can mix and match!</p> <p data-svelte-h="svelte-1cop7hb">Here are some common properties that your model might be using, and corresponding modeling files to check as an example:</p> <ul data-svelte-h="svelte-579f4p"><li>Mixture of experts: <code>SwitchTransformers</code> or <code>Mixtral</code></li> <li>Interleaved (and/or partial) rotary embedding: <code>Glm</code>, <code>Phi</code></li> <li>State space models: <ul><li>Hybrid with attention: <code>Jamba</code>, <code>Bamba</code>, <code>Zamba</code></li> <li>Mamba2: <code>Mamba2</code></li></ul></li> <li>Recurrent hidden states: <code>Gemma2</code></li> <li>Different sliding window attention/full attention patterns per layer: <code>Gemma2</code>, <code>Cohere2</code></li> <li>Clipping of QKV: <code>Olmo</code></li> <li>Normalization of QK: <code>Olmo2</code>, <code>Cohere</code></li> <li>Fused QKV (not recommended): <code>Phi3</code></li></ul> <p data-svelte-h="svelte-52k0wz">At Hugging Face, we feel that learning by example is usually (one of) the best ways, so we will now go over a typical modular file, and the different features our linter provides (and its limitations)! 🤗 Let’s use a real world example with the Olmo2 model, which I feel provides a very good illustration of the modular mechanisms. The original file can be found <a href="https://github.com/huggingface/transformers/blob/main/src/transformers/models/olmo2/modular_olmo2.py" rel="nofollow">here</a>. For simplicity, we will go over it class by class, and repeat the modular’s definition of each class. 
For reference, the modeling and configuration of Olmo (v1) on which we will inherit a lot can be found <a href="https://github.com/huggingface/transformers/blob/main/src/transformers/models/olmo/modeling_olmo.py" rel="nofollow">here</a> and <a href="https://github.com/huggingface/transformers/blob/main/src/transformers/models/olmo/configuration_olmo.py" rel="nofollow">here</a> respectively. The final modeling of Olmo2 (generated by running our linter on the modular we will describe below) can be found <a href="https://github.com/huggingface/transformers/blob/main/src/transformers/models/olmo2/modeling_olmo2.py" rel="nofollow">here</a></p> <p data-svelte-h="svelte-33jo7g">Let’s break it down!</p> <h3 class="relative group"><a id="config-class" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#config-class"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Config class</span></h3> <p data-svelte-h="svelte-1xf26oy">Here is the <code>Config</code> definition in modular:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 
cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> ..olmo.configuration_olmo <span class="hljs-keyword">import</span> OlmoConfig
<span class="hljs-keyword">class</span> <span class="hljs-title class_">Olmo2Config</span>(<span class="hljs-title class_ inherited__">OlmoConfig</span>):
<span class="hljs-string">r&quot;&quot;&quot;
This is the configuration class to store the configuration of a [Olmo2Model](/docs/transformers/pr_36049/en/model_doc/olmo2#transformers.Olmo2Model).
&quot;&quot;&quot;</span>
<span class="hljs-keyword">def</span> <span class="hljs-title function_">__init__</span>(<span class="hljs-params">
self,
vocab_size=<span class="hljs-number">50304</span>,
hidden_size=<span class="hljs-number">4096</span>,
intermediate_size=<span class="hljs-number">11008</span>,
num_hidden_layers=<span class="hljs-number">32</span>,
num_attention_heads=<span class="hljs-number">32</span>,
num_key_value_heads=<span class="hljs-literal">None</span>,
hidden_act=<span class="hljs-string">&quot;silu&quot;</span>,
max_position_embeddings=<span class="hljs-number">2048</span>,
initializer_range=<span class="hljs-number">0.02</span>,
use_cache=<span class="hljs-literal">True</span>,
pad_token_id=<span class="hljs-number">1</span>,
bos_token_id=<span class="hljs-literal">None</span>,
eos_token_id=<span class="hljs-number">50279</span>,
tie_word_embeddings=<span class="hljs-literal">False</span>,
rope_theta=<span class="hljs-number">10000.0</span>,
rope_scaling=<span class="hljs-literal">None</span>,
attention_bias=<span class="hljs-literal">False</span>,
attention_dropout=<span class="hljs-number">0.0</span>,
rms_norm_eps=<span class="hljs-number">1e-5</span>,
**kwargs,
</span>):
<span class="hljs-built_in">super</span>().__init__(
vocab_size=vocab_size,
hidden_size=hidden_size,
intermediate_size=intermediate_size,
num_hidden_layers=num_hidden_layers,
num_attention_heads=num_attention_heads,
num_key_value_heads=num_key_value_heads,
hidden_act=hidden_act,
max_position_embeddings=max_position_embeddings,
initializer_range=initializer_range,
use_cache=use_cache,
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
tie_word_embeddings=tie_word_embeddings,
rope_theta=rope_theta,
rope_scaling=rope_scaling,
attention_bias=attention_bias,
attention_dropout=attention_dropout,
**kwargs,
)
self.rms_norm_eps = rms_norm_eps
<span class="hljs-keyword">del</span> self.clip_qkv<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-17h1ydr">Here, we correctly identified that the <code>Config</code> in Olmo2 is similar to Olmo’s, up to a few details:</p> <ol data-svelte-h="svelte-wmze1e"><li>The default value of most arguments has changed</li> <li>we have a new argument, <code>rms_norm_eps</code></li> <li>the argument <code>clip_qkv</code> is not used anymore</li></ol> <p data-svelte-h="svelte-18k2jgy">To solve points 1. and 2., simply overwriting the <code>__init__</code> function with the new default arguments and adding the new one is enough, as you would expect when you want to overwrite a method in Python! Of course you also need to assign the new attribute <code>rms_norm_eps</code> to <code>self</code> in the <code>__init__</code>’s body.<br>
For point 3., we use the special syntax <code>del self.clip_qkv</code>, which, as you can expect, removes the assignment of this attribute in the unravelled code (after the conversion with the linter).</p> <p data-svelte-h="svelte-8zw5rz">Now, there is a subtlety here: as you can see, we used <code>super().__init__(...)</code>. Usually, in Python, it is simply used to call the parent’s <code>__init__</code>. In modular terms, however, it has a <em>slightly</em> different meaning. When we find a call such as <code>super().my_function(...)</code> in the modular file, the linter will take the body of the <code>my_function</code> function in the parent, and unravel it where the call to <code>super().my_function(...)</code> occurred. Then, the <code>del self.clip_qkv</code> statement will remove the reference to <code>self.clip_qkv</code> from the unravelled body. Thus <code>del self.xxx</code> can only work in pair with <code>super().my_function(...)</code>, and should always be placed after it (but you can add whatever you want <em>before</em> calling <code>super()</code>, and it will be placed, as you can expect, before the parent’s body).</p> <h3 class="relative group"><a id="norm-class" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#norm-class"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 
11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Norm class</span></h3> <p data-svelte-h="svelte-zq0wu8">Here is the <code>Norm</code> class:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> ..llama.modeling_llama <span class="hljs-keyword">import</span> LlamaRMSNorm
<span class="hljs-keyword">class</span> <span class="hljs-title class_">Olmo2RMSNorm</span>(<span class="hljs-title class_ inherited__">LlamaRMSNorm</span>):
<span class="hljs-keyword">pass</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-4e6smr">What to say here, it is pretty explicit isn’t it? We do not modify anything from the <code>LlamaRMSNorm</code> definition. Thus the linter will unravel exactly the content of the parent (<code>LlamaRMSNorm</code>). Only change will be that every reference to “llama” on the docstrings, type hints, and comments (basically everywhere) will be changed to references to “olmo2” for consistency!</p> <h3 class="relative group"><a id="attention-class" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#attention-class"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Attention class</span></h3> <p data-svelte-h="svelte-m4gli0">Here is the <code>Attention</code> class:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" 
fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> ..llama.modeling_llama <span class="hljs-keyword">import</span> eager_attention_forward
<span class="hljs-keyword">from</span> ..olmo.modeling_olmo <span class="hljs-keyword">import</span> OlmoAttention, apply_rotary_pos_emb
<span class="hljs-comment"># Olmo2 attention is identical to OLMo attention except:</span>
<span class="hljs-comment"># - Norm is applied to attention queries and keys.</span>
<span class="hljs-comment"># - No qkv clipping.</span>
<span class="hljs-keyword">class</span> <span class="hljs-title class_">Olmo2Attention</span>(<span class="hljs-title class_ inherited__">OlmoAttention</span>):
<span class="hljs-keyword">def</span> <span class="hljs-title function_">__init__</span>(<span class="hljs-params">self, config: Olmo2Config, layer_idx: <span class="hljs-type">Optional</span>[<span class="hljs-built_in">int</span>] = <span class="hljs-literal">None</span></span>):
<span class="hljs-built_in">super</span>().__init__(config, layer_idx=layer_idx)
self.q_norm = Olmo2RMSNorm(config.num_attention_heads * self.head_dim, config.rms_norm_eps)
self.k_norm = Olmo2RMSNorm(config.num_key_value_heads * self.head_dim, config.rms_norm_eps)
<span class="hljs-keyword">def</span> <span class="hljs-title function_">forward</span>(<span class="hljs-params">
self,
hidden_states: torch.Tensor,
position_embeddings: <span class="hljs-type">Tuple</span>[torch.Tensor, torch.Tensor],
attention_mask: <span class="hljs-type">Optional</span>[torch.Tensor],
past_key_value: <span class="hljs-type">Optional</span>[Cache] = <span class="hljs-literal">None</span>,
cache_position: <span class="hljs-type">Optional</span>[torch.LongTensor] = <span class="hljs-literal">None</span>,
**kwargs,
</span>) -&gt; <span class="hljs-type">Tuple</span>[torch.Tensor, <span class="hljs-type">Optional</span>[torch.Tensor], <span class="hljs-type">Optional</span>[<span class="hljs-type">Tuple</span>[torch.Tensor]]]:
input_shape = hidden_states.shape[:-<span class="hljs-number">1</span>]
hidden_shape = (*input_shape, -<span class="hljs-number">1</span>, self.head_dim)
query_states = self.q_norm(self.q_proj(hidden_states))
key_states = self.k_norm(self.k_proj(hidden_states))
value_states = self.v_proj(hidden_states)
query_states = query_states.view(hidden_shape).transpose(<span class="hljs-number">1</span>, <span class="hljs-number">2</span>)
key_states = key_states.view(hidden_shape).transpose(<span class="hljs-number">1</span>, <span class="hljs-number">2</span>)
value_states = value_states.view(hidden_shape).transpose(<span class="hljs-number">1</span>, <span class="hljs-number">2</span>)
cos, sin = position_embeddings
query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
<span class="hljs-keyword">if</span> past_key_value <span class="hljs-keyword">is</span> <span class="hljs-keyword">not</span> <span class="hljs-literal">None</span>:
<span class="hljs-comment"># sin and cos are specific to RoPE models; cache_position needed for the static cache</span>
cache_kwargs = {<span class="hljs-string">&quot;sin&quot;</span>: sin, <span class="hljs-string">&quot;cos&quot;</span>: cos, <span class="hljs-string">&quot;cache_position&quot;</span>: cache_position}
key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
attention_interface: <span class="hljs-type">Callable</span> = eager_attention_forward
<span class="hljs-keyword">if</span> self.config._attn_implementation != <span class="hljs-string">&quot;eager&quot;</span>:
<span class="hljs-keyword">if</span> self.config._attn_implementation == <span class="hljs-string">&quot;sdpa&quot;</span> <span class="hljs-keyword">and</span> kwargs.get(<span class="hljs-string">&quot;output_attentions&quot;</span>, <span class="hljs-literal">False</span>):
logger.warning_once(
<span class="hljs-string">&quot;`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to &quot;</span>
<span class="hljs-string">&#x27;eager attention. This warning can be removed using the argument `attn_implementation=&quot;eager&quot;` when loading the model.&#x27;</span>
)
<span class="hljs-keyword">else</span>:
attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
attn_output, attn_weights = attention_interface(
self,
query_states,
key_states,
value_states,
attention_mask,
dropout=<span class="hljs-number">0.0</span> <span class="hljs-keyword">if</span> <span class="hljs-keyword">not</span> self.training <span class="hljs-keyword">else</span> self.attention_dropout,
scaling=self.scaling,
**kwargs,
)
attn_output = attn_output.reshape(*input_shape, -<span class="hljs-number">1</span>).contiguous()
attn_output = self.o_proj(attn_output)
<span class="hljs-keyword">return</span> attn_output, attn_weights<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-f779jp">Now, what’s happening here? In the <code>__init__</code>, we call <code>super().__init__(...)</code>, thus copying the parent’s definition, then add 2 new layers of the <code>Olmo2RMSNorm</code> we just added previously. Indeed, those were not present in the original <code>Olmo</code> (v1) model. So, now, we also have to overwrite the <code>forward</code> method to use these 2 new layers, right? Indeed, if you check carefully, the definition of <code>forward</code> is identical to <code>Olmo</code>’s, but we added a pass with the norm layers just after projecting with <code>q_proj</code> and <code>k_proj</code>. However, to help us, we directly imported the functions <code>eager_attention_forward</code> from llama, and <code>apply_rotary_pos_emb</code> from olmo. The linter will then automatically add these imported functions in the final <code>modeling_olmo2.py</code> file, by copying their definitions from the source (imported) files. And it will even add the <code>rotate_half</code> and <code>repeat_kv</code> functions (which are used inside <code>apply_rotary_pos_emb</code> and <code>eager_attention_forward</code> respectively) by figuring out the dependency automatically. Neat, right?<br>
Note that we had to redefine this class, because we did not find any model defining the <code>Attention</code> layer with the added <code>RMSNorm</code> layer anywhere else in the library! Otherwise, we would have simply inherited from this model instead as we did for the <code>RMSNorm</code>!</p> <h3 class="relative group"><a id="the-decoderlayer-class" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#the-decoderlayer-class"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>The DecoderLayer class</span></h3> <p data-svelte-h="svelte-8hd4fl">Here is the <code>DecoderLayer</code> class:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path 
d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> ..olmo.modeling_olmo <span class="hljs-keyword">import</span> OlmoDecoderLayer
<span class="hljs-comment"># The OLMo2 layers are identical to those of the OLMo model except:</span>
<span class="hljs-comment"># - RMSNorm is used instead of standard layer norm.</span>
<span class="hljs-comment"># - Norm is applied after attention/feedforward rather than before.</span>
<span class="hljs-keyword">class</span> <span class="hljs-title class_">Olmo2DecoderLayer</span>(<span class="hljs-title class_ inherited__">OlmoDecoderLayer</span>):
<span class="hljs-keyword">def</span> <span class="hljs-title function_">__init__</span>(<span class="hljs-params">self, config: Olmo2Config, layer_idx: <span class="hljs-built_in">int</span></span>):
<span class="hljs-built_in">super</span>().__init__(config, layer_idx=layer_idx)
self.post_attention_layernorm = Olmo2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
self.post_feedforward_layernorm = Olmo2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
self.self_attn = Olmo2Attention(config=config, layer_idx=layer_idx)
<span class="hljs-keyword">del</span> self.input_layernorm
<span class="hljs-keyword">def</span> <span class="hljs-title function_">forward</span>(<span class="hljs-params">
self,
hidden_states: torch.Tensor,
attention_mask: <span class="hljs-type">Optional</span>[torch.Tensor] = <span class="hljs-literal">None</span>,
position_ids: <span class="hljs-type">Optional</span>[torch.LongTensor] = <span class="hljs-literal">None</span>,
past_key_value: <span class="hljs-type">Optional</span>[Cache] = <span class="hljs-literal">None</span>,
output_attentions: <span class="hljs-type">Optional</span>[<span class="hljs-built_in">bool</span>] = <span class="hljs-literal">False</span>,
use_cache: <span class="hljs-type">Optional</span>[<span class="hljs-built_in">bool</span>] = <span class="hljs-literal">False</span>,
cache_position: <span class="hljs-type">Optional</span>[torch.LongTensor] = <span class="hljs-literal">None</span>,
position_embeddings: <span class="hljs-type">Optional</span>[<span class="hljs-type">Tuple</span>[torch.Tensor, torch.Tensor]] = <span class="hljs-literal">None</span>, <span class="hljs-comment"># necessary, but kept here for BC</span>
**kwargs,
</span>) -&gt; <span class="hljs-type">Tuple</span>[torch.FloatTensor, <span class="hljs-type">Optional</span>[<span class="hljs-type">Tuple</span>[torch.FloatTensor, torch.FloatTensor]]]:
residual = hidden_states
<span class="hljs-comment"># Self Attention</span>
hidden_states, self_attn_weights = self.self_attn(
hidden_states=hidden_states,
attention_mask=attention_mask,
position_ids=position_ids,
past_key_value=past_key_value,
output_attentions=output_attentions,
use_cache=use_cache,
cache_position=cache_position,
position_embeddings=position_embeddings,
**kwargs,
)
hidden_states = self.post_attention_layernorm(hidden_states)
hidden_states = residual + hidden_states
<span class="hljs-comment"># Fully Connected</span>
residual = hidden_states
hidden_states = self.mlp(hidden_states)
hidden_states = self.post_feedforward_layernorm(hidden_states)
hidden_states = residual + hidden_states
outputs = (hidden_states,)
<span class="hljs-keyword">if</span> output_attentions:
outputs += (self_attn_weights,)
<span class="hljs-keyword">return</span> outputs<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1puk9ym">At this point, you should start to pick up what is happening for this class. We switched the type of norm in the <code>__init__</code> by overwriting <code>self.post_attention_layernorm</code> after the call to <code>super().__init__(...)</code>, thus going from a <code>LayerNorm</code> in the parent class, to our <code>RMSNorm</code> in this class. Then we simply deleted the <code>self.input_layernorm</code> attribute, and replaced it by <code>self.post_feedforward_layernorm</code>, because the name was not making sense anymore as we apply it after in <code>Olmo2</code> instead of before in <code>Olmo</code>. For this reason, we also need to overwrite the <code>forward</code> method, to reflect the logic change.</p> <p data-svelte-h="svelte-1jvwg68">Note however that if we had only switched <code>self.post_attention_layernorm</code> and <code>self.input_layernorm</code> from <code>LayerNorm</code>s to <code>RMSNorm</code>s (without the name and logic change of <code>self.input_layernorm</code>), we would not have had to redefine the <code>forward</code> method!</p> <h3 class="relative group"><a id="the-model-class" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#the-model-class"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 
0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>The Model class</span></h3> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> ..olmo.modeling_olmo <span class="hljs-keyword">import</span> OlmoModel
<span class="hljs-comment"># The OLMo2 model is identical to the OLMo model, except RMSNorm is used instead of</span>
<span class="hljs-comment"># standard layer norm for the output norm.</span>
<span class="hljs-keyword">class</span> <span class="hljs-title class_">Olmo2Model</span>(<span class="hljs-title class_ inherited__">OlmoModel</span>):
<span class="hljs-keyword">def</span> <span class="hljs-title function_">__init__</span>(<span class="hljs-params">self, config: Olmo2Config</span>):
<span class="hljs-built_in">super</span>().__init__(config)
self.norm = Olmo2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
self.layers = nn.ModuleList(
[Olmo2DecoderLayer(config, layer_idx) <span class="hljs-keyword">for</span> layer_idx <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(config.num_hidden_layers)]
)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1upff2z">Here, this is exactly what I was pointing out before: we simply change the <em>type</em> of the <code>self.norm</code> attribute (going from <code>LayerNorm</code> in <code>Olmo</code> to <code>RMSNorm</code> in <code>Olmo2</code>). Since this change does not affect the logic of the <code>forward</code> method (the name of the layer and where it is used are identical to the parent’s), we do not even need to overwrite it! It will be unravelled automatically! Note that we redefined <code>self.layers</code> for the sake of being explicit, but this is not even strictly required here as the definition is similar to what is found in <code>Olmo</code> (v1).</p> <h3 class="relative group"><a id="finally-the-forcausallm-class" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#finally-the-forcausallm-class"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Finally… The ForCausalLM class</span></h3> <p data-svelte-h="svelte-hjuool">Finally, here is the definition of the <code>ForCausalLM</code>:</p> <div class="code-block relative"><div class="absolute top-2.5 
right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> ..olmo.modeling_olmo <span class="hljs-keyword">import</span> OlmoForCausalLM
<span class="hljs-keyword">class</span> <span class="hljs-title class_">Olmo2ForCausalLM</span>(<span class="hljs-title class_ inherited__">OlmoForCausalLM</span>):
<span class="hljs-keyword">pass</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1p1nc5i">As for the <code>RMSNorm</code>, it is exactly similar to the parent’s in logic, so we do not have anything to do: the linter will figure it all out by itself. Almost disappointing, no?</p> <a id="dependencies"></a>
<h3 class="relative group"><span>But... What about the MLP, RotaryEmbedding and PreTrainedModel classes?</span></h3>
<p data-svelte-h="svelte-129wpkp">Indeed, if you inspect the file <a href="https://github.com/huggingface/transformers/blob/main/src/transformers/models/olmo2/modeling_olmo2.py" rel="nofollow">modeling_olmo2.py</a> which is created by running the linter on <code>modular_olmo2.py</code>, you will notice that it also creates <code>Olmo2MLP</code>, <code>Olmo2RotaryEmbedding</code>, and <code>Olmo2PreTrainedModel</code> classes, that we did not define explicitly in <code>modular_olmo2.py</code>.</p> <p data-svelte-h="svelte-1iwt0rn">Well, it is one of the main features of our modular linter. Similarly to how some functions were added automatically with the <code>Attention</code> class (without directly importing them), classes that are a dependency of one of the inherited classes and which are not explicitly defined in the modular file will be added automatically as part of the dependency tracing. For example, in <code>OlmoDecoderLayer</code>, there is an attribute defined as <code>self.mlp = OlmoMLP(config)</code>. Because we never explicitly redefined a class named <code>Olmo2MLP</code> in <code>modular_olmo2.py</code>, the linter automatically created a class <code>Olmo2MLP</code>, similar to <code>OlmoMLP</code>. 
This is exactly the same as if we had done:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> ..olmo.modeling_olmo <span class="hljs-keyword">import</span> OlmoMLP
<span class="hljs-keyword">class</span> <span class="hljs-title class_">Olmo2MLP</span>(<span class="hljs-title class_ inherited__">OlmoMLP</span>):
<span class="hljs-keyword">pass</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-hx072i">but we did not even bother, because we <em>know</em> this class is supposed to be exactly similar, and we never needed it anywhere else in the <code>modular_olmo2.py</code> file. In contrast, the class <code>Olmo2RMSNorm</code> was needed to (re)define the norms both in the <code>Attention</code> and <code>DecoderLayer</code> classes. The same logic is true for the <code>Olmo2PreTrainedModel</code> and <code>Olmo2RotaryEmbedding</code> classes.</p> <p data-svelte-h="svelte-115qkhu">Note however that if not redefined, classes will be copied from the file in which an inherited module uses them first. So if you wanted e.g. <code>Olmo2MLP</code> to inherit from, say, <code>MistralMLP</code> instead of <code>OlmoMLP</code> (here it was <code>OlmoMLP</code> because it was first implicitly used in <code>Olmo2DecoderLayer</code>, which inherited from <code>OlmoDecoderLayer</code>), you would need to be explicit and do:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div 
class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># switch to mistral definition</span>
<span class="hljs-keyword">from</span> ..mistral.modeling_mistral <span class="hljs-keyword">import</span> MistralMLP
<span class="hljs-keyword">class</span> <span class="hljs-title class_">Olmo2MLP</span>(<span class="hljs-title class_ inherited__">MistralMLP</span>):
<span class="hljs-keyword">pass</span><!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="advanced-usage" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#advanced-usage"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Advanced usage</span></h2> <p data-svelte-h="svelte-2lsmzy">Now that you should have a good grasp of how modular works, let’s see some more advanced use cases and features you can use.</p> <h3 class="relative group"><a id="removing-attributes-which-are-not-just-assignments" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#removing-attributes-which-are-not-just-assignments"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 
1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Removing attributes which are not just assignments</span></h3> <p data-svelte-h="svelte-j6677p">As we have seen before, after using <code>super().__init__()</code>, we can use <code>del self.attribute</code> to remove a specific attribute which was defined in the parent. What if this attribute was used elsewhere though? Meaning it was not just “defined to be stored” as in the config for example. For example, consider the following case:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> 
Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">class</span> <span class="hljs-title class_">DummyModel</span>(nn.Module):
<span class="hljs-keyword">def</span> <span class="hljs-title function_">__init__</span>(<span class="hljs-params">self, config: DummyConfig</span>):
<span class="hljs-built_in">super</span>().__init__()
self.attribute = config.attribute
<span class="hljs-keyword">if</span> self.attribute:
<span class="hljs-comment"># do more stuff with `self.attribute` here</span>
...<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1y6u0yw">Then inheriting from this <code>DummyModel</code> and doing</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">class</span> <span class="hljs-title class_">MyNewDummyModel</span>(<span class="hljs-title class_ inherited__">DummyModel</span>):
<span class="hljs-keyword">def</span> <span class="hljs-title function_">__init__</span>(<span class="hljs-params">self, config: MyNewDummyConfig</span>):
<span class="hljs-built_in">super</span>().__init__(config)
<span class="hljs-keyword">del</span> self.attribute<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1gbrqtm">is not supported, because it will only suppress the assignment, i.e. the line <code>self.attribute = config.attribute</code> will disappear, but the <code>if</code> statement will stay and reference the attribute. We tried to make it work by suppressing every mention of the attribute, however it is not a sound solution in the general case (it can lead to very surprising effects and remove other important parts) and is therefore not possible.</p> <p data-svelte-h="svelte-1kphpee">But what if I still want to inherit from <code>DummyModel</code>? How to properly do it? How to use <code>super().__init__()</code> without copy/pasting the parent then? This brings us to the next point:</p> <h3 class="relative group"><a id="avoiding-super-special-meaning" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#avoiding-super-special-meaning"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Avoiding super() special meaning</span></h3> <p data-svelte-h="svelte-1bd6kt0">Say you still want to inherit from 
<code>DummyModel</code> (because it is convenient for some other methods) but you do want to remove the <code>self.attribute</code>. How to properly override the <code>__init__</code> method, while calling <code>super()</code> but without unravelling the parent’s code? Well, then be explicit about which class <code>super()</code>’s you are calling! If we want to call the <code>nn.Module</code>’s <code>super()</code> for example, we can do the following (unravelled code on the right):</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">class</span> <span class="hljs-title class_">MyNewDummyModel</span>(DummyModel, nn.Module): | <span class="hljs-keyword">class</span> <span class="hljs-title class_">MyNewDummyModel</span>(nn.Module):
|
<span class="hljs-keyword">def</span> <span class="hljs-title function_">__init__</span>(<span class="hljs-params">self, config: MyNewDummyConfig</span>): | <span class="hljs-keyword">def</span> <span class="hljs-title function_">__init__</span>(<span class="hljs-params">self, config: MyNewDummyConfig</span>):
nn.Module.__init__(self)   | <span class="hljs-built_in">super</span>().__init__()
self.foo = config.foo | self.foo = config.foo
... | ...<!-- HTML_TAG_END --></pre></div> <h3 class="relative group"><a id="deleting-unused-methods" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#deleting-unused-methods"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Deleting unused methods</span></h3> <p data-svelte-h="svelte-c9w476">Removing a class method is pretty similar to removing an attribute: you just need to overwrite it with a <code>raise AttributeError(&quot;&quot;)</code> to mimic the behaviour you actually want when you remove a parent function in Python. 
For example, the following will remove the methods in the unravelled code:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">class</span> <span class="hljs-title class_">GemmaTokenizer</span>(<span class="hljs-title class_ inherited__">LlamaTokenizer</span>):
...
<span class="hljs-keyword">def</span> <span class="hljs-title function_">get_spm_processor</span>(<span class="hljs-params">self</span>):
<span class="hljs-keyword">raise</span> AttributeError(<span class="hljs-string">&quot;Not needed for Gemma&quot;</span>)
<span class="hljs-keyword">def</span> <span class="hljs-title function_">unk_token_length</span>(<span class="hljs-params">self</span>):
<span class="hljs-keyword">raise</span> AttributeError(<span class="hljs-string">&quot;Not needed for Gemma&quot;</span>)<!-- HTML_TAG_END --></pre></div> <h3 class="relative group"><a id="define-new-functions" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#define-new-functions"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Define new functions</span></h3> <p data-svelte-h="svelte-hvsyf6">Of course, if you define a new function in the <code>modular</code> file, and use it inside an inherited class, say</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path 
d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">def</span> <span class="hljs-title function_">my_new_function</span>(<span class="hljs-params">*args, **kwargs</span>):
<span class="hljs-comment"># Do something here</span>
<span class="hljs-keyword">pass</span>
<span class="hljs-keyword">class</span> <span class="hljs-title class_">DummyModel</span>(<span class="hljs-title class_ inherited__">LlamaModel</span>):
<span class="hljs-keyword">def</span> <span class="hljs-title function_">forward</span>(<span class="hljs-params">*args, **kwargs</span>):
<span class="hljs-comment"># Call the function</span>
example = my_new_function(*args, **kwargs)
<span class="hljs-comment"># continue here</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1ov84rp">the <code>my_new_function</code> function (and, recursively, any other functions called in its body) will be automatically added to the unravelled code even if it is not present in the parent’s file (here Llama).</p> <h3 class="relative group"><a id="decorators" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#decorators"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Decorators</span></h3> <p data-svelte-h="svelte-i7t16s">By default, if you inherit from a class and override a method which has 1 (or more) decorators in the parent’s method, the decorators will be added as well in the unravelled code, <em>but only if you do not add any yourself</em>. 
Otherwise, it will of course use whatever decorator you redefined.</p> <p data-svelte-h="svelte-1xerhbv">That is, imagine the following parent class</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">class</span> <span class="hljs-title class_">DummyModel</span>(nn.Module):
...
<span class="hljs-meta"> @decorator(<span class="hljs-params">...</span>)</span>
<span class="hljs-keyword">def</span> <span class="hljs-title function_">forward</span>(<span class="hljs-params">...</span>):
<span class="hljs-comment"># do stuff here</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-3hueol">Then, if you simply override the method it will produce (modular on the left, unravelled code on the right):</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">class</span> <span class="hljs-title class_">NewModel</span>(<span class="hljs-title class_ inherited__">DummyModel</span>): | <span class="hljs-keyword">class</span> <span class="hljs-title class_">NewModel</span>(nn.Module):
... | ...
|
<span class="hljs-keyword">def</span> <span class="hljs-title function_">forward</span>(<span class="hljs-params">...</span>): | @decorator(...)
... | <span class="hljs-keyword">def</span> <span class="hljs-title function_">forward</span>(<span class="hljs-params">...</span>):
| ...<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-gta4rc">That is, it keeps the parent’s decorators by default. However, if you do:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">class</span> <span class="hljs-title class_">NewModel</span>(<span class="hljs-title class_ inherited__">DummyModel</span>): | <span class="hljs-keyword">class</span> <span class="hljs-title class_">NewModel</span>(nn.Module):
... | ...
|
<span class="hljs-meta"> @my_new_decorator(<span class="hljs-params">...</span>) | @my_new_decorator(<span class="hljs-params">...</span>)</span>
<span class="hljs-keyword">def</span> <span class="hljs-title function_">forward</span>(<span class="hljs-params">...</span>): | <span class="hljs-keyword">def</span> <span class="hljs-title function_">forward</span>(<span class="hljs-params">...</span>):
... | ...<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1u8jzso">Then it keeps your own new decorator.</p> <h3 class="relative group"><a id="the-superkwargs-special-case" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#the-superkwargs-special-case"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>The super_kwargs special case</span></h3> <p data-svelte-h="svelte-cbveqz">In the above case about decorators, what if the <code>forward</code> method is really long, and I just want to switch the decorators? Do I really have to redefine it all and copy/paste the body just for the decorator? Fortunately, no. If you followed until this point, you know that you can use <code>super().forward(...)</code>, and it will unravel the parent’s body automatically. But what if there are plenty of arguments in the function’s signature, and we are very lazy? For that use-case, we introduced the special syntax <code>**super_kwargs</code> in the overridden method signature. It basically means: “unravel all the parent’s signature arguments here”. 
For example, a common signature in the <code>ForCausalLM</code> model is the following (copied from llama’s modeling):</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">class</span> <span class="hljs-title class_">LlamaForCausalLM</span>(nn.Module):
...
<span class="hljs-meta"> @add_start_docstrings_to_model_forward(<span class="hljs-params">LLAMA_INPUTS_DOCSTRING</span>)</span>
<span class="hljs-meta"> @replace_return_docstrings(<span class="hljs-params">output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC</span>)</span>
<span class="hljs-keyword">def</span> <span class="hljs-title function_">forward</span>(<span class="hljs-params">
self,
input_ids: torch.LongTensor = <span class="hljs-literal">None</span>,
attention_mask: <span class="hljs-type">Optional</span>[torch.Tensor] = <span class="hljs-literal">None</span>,
position_ids: <span class="hljs-type">Optional</span>[torch.LongTensor] = <span class="hljs-literal">None</span>,
past_key_values: <span class="hljs-type">Optional</span>[<span class="hljs-type">Union</span>[Cache, <span class="hljs-type">List</span>[torch.FloatTensor]]] = <span class="hljs-literal">None</span>,
inputs_embeds: <span class="hljs-type">Optional</span>[torch.FloatTensor] = <span class="hljs-literal">None</span>,
labels: <span class="hljs-type">Optional</span>[torch.LongTensor] = <span class="hljs-literal">None</span>,
use_cache: <span class="hljs-type">Optional</span>[<span class="hljs-built_in">bool</span>] = <span class="hljs-literal">None</span>,
output_attentions: <span class="hljs-type">Optional</span>[<span class="hljs-built_in">bool</span>] = <span class="hljs-literal">None</span>,
output_hidden_states: <span class="hljs-type">Optional</span>[<span class="hljs-built_in">bool</span>] = <span class="hljs-literal">None</span>,
return_dict: <span class="hljs-type">Optional</span>[<span class="hljs-built_in">bool</span>] = <span class="hljs-literal">None</span>,
cache_position: <span class="hljs-type">Optional</span>[torch.LongTensor] = <span class="hljs-literal">None</span>,
num_logits_to_keep: <span class="hljs-built_in">int</span> = <span class="hljs-number">0</span>,
**kwargs: Unpack[KwargsForCausalLM],
</span>) -&gt; <span class="hljs-type">Union</span>[<span class="hljs-type">Tuple</span>, CausalLMOutputWithPast]:
...<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-154jhi">As you can see, this is a rather long and complicated signature. But if you do the following (as usual, modular on the left, unravelled code by the linter on the right):</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">class</span> <span class="hljs-title class_">NewModelForCausalLM</span>(<span class="hljs-title class_ inherited__">LlamaForCausalLM</span>): | <span class="hljs-keyword">class</span> <span class="hljs-title class_">LlamaForCausalLM</span>(nn.Module):
... | ...
|
<span class="hljs-meta"> @my_new_decorator | @my_new_decorator</span>
<span class="hljs-keyword">def</span> <span class="hljs-title function_">forward</span>(<span class="hljs-params">self, **super_kwargs</span>): | <span class="hljs-keyword">def</span> <span class="hljs-title function_">forward</span>(<span class="hljs-params">
<span class="hljs-built_in">super</span>(<span class="hljs-params"></span>).forward(<span class="hljs-params">**super_kwargs</span>) | self,
| input_ids: torch.LongTensor = <span class="hljs-literal">None</span>,
| attention_mask: <span class="hljs-type">Optional</span>[torch.Tensor] = <span class="hljs-literal">None</span>,
| position_ids: <span class="hljs-type">Optional</span>[torch.LongTensor] = <span class="hljs-literal">None</span>,
| past_key_values: <span class="hljs-type">Optional</span>[<span class="hljs-type">Union</span>[Cache, <span class="hljs-type">List</span>[torch.FloatTensor]]] = <span class="hljs-literal">None</span>,
| inputs_embeds: <span class="hljs-type">Optional</span>[torch.FloatTensor] = <span class="hljs-literal">None</span>,
| labels: <span class="hljs-type">Optional</span>[torch.LongTensor] = <span class="hljs-literal">None</span>,
| use_cache: <span class="hljs-type">Optional</span>[<span class="hljs-built_in">bool</span>] = <span class="hljs-literal">None</span>,
| output_attentions: <span class="hljs-type">Optional</span>[<span class="hljs-built_in">bool</span>] = <span class="hljs-literal">None</span>,
| output_hidden_states: <span class="hljs-type">Optional</span>[<span class="hljs-built_in">bool</span>] = <span class="hljs-literal">None</span>,
| return_dict: <span class="hljs-type">Optional</span>[<span class="hljs-built_in">bool</span>] = <span class="hljs-literal">None</span>,
| cache_position: <span class="hljs-type">Optional</span>[torch.LongTensor] = <span class="hljs-literal">None</span>,
| num_logits_to_keep: <span class="hljs-built_in">int</span> = <span class="hljs-number">0</span>,
| **kwargs: Unpack[KwargsForCausalLM],
| </span>) -&gt; <span class="hljs-type">Union</span>[<span class="hljs-type">Tuple</span>, CausalLMOutputWithPast]:
| ...<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-2ilo5h">and the <code>**super_kwargs</code> syntax unravelled all the arguments, while the <code>super().forward()</code> syntax unravelled the whole body! As you can see, this is a great combo when you just want to switch the decorators, as it is very easy to use, and makes it explicit that the only change you want to apply is the decorator.</p> <p data-svelte-h="svelte-sb4f3e">However, we want to make it clear that the <code>**super_kwargs</code> syntax is not a replacement for being explicit when you redefine your methods: if you actually overwrite the method (i.e. you do not call <code>super().method()</code>), then we want you to explicitly write the signature as you would usually. This is only a short-cut when switching decorators, and a few other niche cases.</p> <h3 class="relative group"><a id="the-docstring-variables" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#the-docstring-variables"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>The DOCSTRING variables</span></h3> <p data-svelte-h="svelte-187vlwj">Usually, if an object is defined both in the 
modular file and the modeling file from which we inherit, then the definition of the modular takes precedence. However, this is not the case for assignments containing the pattern <code>DOCSTRING</code>. Indeed, we usually have variables defined as <code>MODEL_START_DOCSTRING</code> and <code>MODEL_INPUT_DOCSTRING</code> in the modeling files. These are just very big blocks of, well, docstrings… But they are (almost) always exactly the same up to the model name! And modular automatically rewrites the names everywhere! For this reason, assignments containing the pattern will <em>always</em> use the definition found in the source file instead of the modular file. This is extremely handy if we need the variable reference somewhere (e.g. to redefine a decorator) but we do not want to clutter the modular file with 100 lines of docstrings which are always the same. It allows you to do the following (taken from <a href="https://github.com/huggingface/transformers/blob/main/src/transformers/models/starcoder2/modular_starcoder2.py#L146" rel="nofollow">modular_starcoder2.py</a>):</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform 
-translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->STARCODER2_INPUTS_DOCSTRING = <span class="hljs-literal">None</span> <span class="hljs-comment"># will be automatically redefined</span>
<span class="hljs-keyword">class</span> <span class="hljs-title class_">Starcoder2Model</span>(<span class="hljs-title class_ inherited__">MistralModel</span>):
...
<span class="hljs-meta"> @add_start_docstrings_to_model_forward(<span class="hljs-params">STARCODER2_INPUTS_DOCSTRING</span>)</span>
<span class="hljs-keyword">def</span> <span class="hljs-title function_">forward</span>(<span class="hljs-params">...</span>)
...<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-sm5l0i">and here, the linter will correctly take the same definition of the docstring as in <code>Mistral</code>, without having to clutter the modular file!</p> <h2 class="relative group"><a id="limitations" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#limitations"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Limitations</span></h2> <p data-svelte-h="svelte-168b6cn">Now, let’s go over some of the limitations of modular.</p> <h3 class="relative group"><a id="special-naming-essentially-for-multimodal-models" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#special-naming-essentially-for-multimodal-models"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 
1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Special naming (essentially for multimodal models)</span></h3> <p data-svelte-h="svelte-1gs6a5h">Because our linter automatically renames everything when inheriting from a class (defining <code>class NewModelMLP(LlamaMLP)</code> will rename every mention of <code>Llama</code> to <code>NewModel</code>, and recursively for all dependencies grabbed), it has somewhat strict rules when it comes to naming. For consistency reasons, we require that you always use the same class name prefix when inheriting different classes from the same file. For example, doing:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 
transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">class</span> <span class="hljs-title class_">MyModelIncredibleMLP</span>(<span class="hljs-title class_ inherited__">LlamaMLP</span>):
...
<span class="hljs-keyword">class</span> <span class="hljs-title class_">MyModelDecoderLayer</span>(<span class="hljs-title class_ inherited__">LlamaDecoderLayer</span>):
...<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-um3kky">is not recommended, first because it breaks standards in the library and we do not like it, and second because the linter will not know how to rename potential higher-order dependencies (should we use <code>MyModelIncredible</code>, or <code>MyModel</code>?).</p> <p data-svelte-h="svelte-143l68k">If there are no dependencies to grab implicitly however (see <a href="#dependencies">this section</a> to understand implicit dependencies), local renaming (for a single class) will not be an issue and the linter will not complain. But make sure to explicitly redefine every other mention of the class with the new name pattern! For instance, in the example above, all mentions of <code>LlamaMLP</code> in other modules inherited should be explicitly replaced by mentions of <code>MyModelIncredibleMLP</code>, otherwise the linter may add a new and unwanted <code>MyModelMLP</code> class!</p> <p data-svelte-h="svelte-1b02g9p">In any case, if there is an ambiguous case detected, the linter will raise a warning such as</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform 
-translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->We detected multiple prefix names when inheriting <span class="hljs-built_in">from</span> transformers.models.llama.modeling_llama: (<span class="hljs-string">&#x27;Emu3Text&#x27;</span>, <span class="hljs-string">&#x27;Emu3&#x27;</span>). We will only use <span class="hljs-keyword">the</span> most used <span class="hljs-string">&#x27;Emu3&#x27;</span> prefix when grabbing args <span class="hljs-keyword">and</span> dependencies. Make sure <span class="hljs-built_in">to</span> subclass <span class="hljs-keyword">the</span> intermediate classes <span class="hljs-keyword">with</span> <span class="hljs-keyword">the</span> prefix you want (<span class="hljs-keyword">if</span> different <span class="hljs-built_in">from</span> <span class="hljs-string">&#x27;Emu3&#x27;</span>) <span class="hljs-keyword">or</span> use <span class="hljs-keyword">a</span> single prefix <span class="hljs-keyword">in</span> all <span class="hljs-keyword">the</span> modular (best).<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-152u23">explaining what is happening, and which prefix is used by default for grabbing dependencies. 
As explained, if you see automatic dependencies appear with a prefix but you want another one, then explicitly rename these classes locally with a simple <code>pass</code> class, such as</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">class</span> <span class="hljs-title class_">Emu3TextMLP</span>(<span class="hljs-title class_ inherited__">LlamaMLP</span>):
<span class="hljs-keyword">pass</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1sm5een">Such warnings and renaming-pattern complications usually only arise when defining multimodal models, when you want to define e.g. the text part of your model from an existing model, but want to add the part <code>Text</code> to the class names to make it clear what they refer to in the multimodal setup.</p> <h3 class="relative group"><a id="automatic-docstrings-issue-mostly-for-configs" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#automatic-docstrings-issue-mostly-for-configs"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Automatic docstrings issue (mostly for Configs)</span></h3> <p data-svelte-h="svelte-1i3zif8">When inheriting a Config class and adding or deleting some attributes, it may be tempting to only redefine the new attributes in the docstring, and hope that modular will do the rest. And similarly when deleting an argument, do nothing and hope that modular will remove it from the docstring. However, due to current limitations of our linter, this is not yet supported. 
Thus, if you are in this case, you need to put the whole docstring (as it should appear in the end, with the correct arguments and default values) directly in the modular file under the class definition.</p> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/transformers/blob/main/docs/source/en/modular_transformers.md" target="_blank"><span data-svelte-h="svelte-1kd6by1">&lt;</span> <span data-svelte-h="svelte-x0xyl0">&gt;</span> <span data-svelte-h="svelte-1dajgef"><span class="underline ml-1.5">Update</span> on GitHub</span></a> <p></p>
<script>
// Client bootstrap for this page: publishes path configuration, then loads the
// runtime and app bundles and starts the app on this script's parent element.
// NOTE(review): presumably SvelteKit's generated hydration snippet, judging by
// the `__sveltekit_*` name and entry paths — confirm against the build output.
{
// Assigned without a declaration keyword, so this resolves to an outer/global
// binding. `assets` and `base` both point at this PR's doc build root.
__sveltekit_tduuc7 = {
assets: "/docs/transformers/pr_36049/en",
base: "/docs/transformers/pr_36049/en",
env: {}
};
// Capture the mount target now: document.currentScript is only set while the
// script is executing synchronously, so it cannot be read inside the `.then`
// callback below.
const element = document.currentScript.parentElement;
// Data handed to start(); both entries are null here.
// NOTE(review): presumably one entry per id in `node_ids` — confirm.
const data = [null,null];
// Load both entry modules concurrently; start only once both have resolved.
Promise.all([
import("/docs/transformers/pr_36049/en/_app/immutable/entry/start.86af8b85.js"),
import("/docs/transformers/pr_36049/en/_app/immutable/entry/app.d602e208.js")
]).then(([kit, app]) => {
// Start the app on the captured element with this page's node ids and data.
kit.start(app, element, {
node_ids: [0, 393],
data,
form: null,
error: null
});
});
}
</script>

Xet Storage Details

Size:
124 kB
·
Xet hash:
064fe66f981108f79df6bf592c4ed2f365e631c9cb23747d77b19f7162d0ff6b

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.