model-architectures / aimv2.diff.svg
ArthurZ's picture
ArthurZ HF Staff
Universal module-tree renderer (conv/codec/FFT/SSM) + standardized component colors
427dd4e verified
|
Raw
History Blame Contribute Delete
37.4 kB
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 1500 1113" width="1500" height="1113" font-size="14">
<style>
/* Light palette (default).
   Component colours come in pairs: --name is the fill and --name-s the
   stroke; the .c-name classes further down consume them. */
:root {
  --bg: #ffffff; --fg: #1b1f24; --muted: #6b7280; --panel: #f6f8fa; --grid: #e5e7eb;
  --embed: #dbeafe; --embed-s: #3b82f6;
  --attn: #cffafe; --attn-s: #06b6d4;
  --mamba: #dcfce7; --mamba-s: #22c55e;
  --linattn: #fce7f3; --linattn-s: #ec4899;
  --recur: #ede9fe; --recur-s: #8b5cf6;
  --moe: #ffedd5; --moe-s: #f97316;
  --mlp: #ede9fe; --mlp-s: #8b5cf6;
  --norm: #e5e7eb; --norm-s: #9ca3af;
  --head: #fee2e2; --head-s: #ef4444;
  --config: #f1f5f9; --config-s: #64748b;
  --rope: #fef9c3; --rope-s: #eab308;
  --layer: #f8fafc; --layer-s: #cbd5e1;
  --io: #f1f5f9; --io-s: #94a3b8;
  --soft: #fae8ff; --soft-s: #c026d3;
  --add: #ffffff; --add-s: #475569;
  --block-s: #94a3b8;
  --residual: #f59e0b;
  /* Diff highlight strokes used by the .ch-* rules and the legend swatches. */
  --added: #16a34a; --over: #d97706; --deleted: #dc2626;
  --lt-full: #06b6d4; --lt-sliding: #3b82f6; --lt-chunked: #8b5cf6;
  --lt-compressed: #f97316; --lt-heavy: #dc2626; --lt-linear: #ec4899; --lt-mamba: #22c55e;
  --cell-on: #0ea5e9; --cell-off: #e5e7eb;
  --vision: #dcfce7; --vision-s: #16a34a; --audio: #fae8ff; --audio-s: #c026d3;
  --proj: #fef3c7; --proj-s: #d97706; --xattn: #db2777;
  --conv: #d1fae5; --conv-s: #10b981; --act: #ecfccb; --act-s: #65a30d;
  --pool: #e0f2fe; --pool-s: #0284c7; --quant: #fae8ff; --quant-s: #c026d3;
}

/* Dark palette: only the fills are swapped, the -s stroke colours are shared
   with the light palette. */
@media (prefers-color-scheme: dark) {
  :root {
    --bg: #0d1117; --fg: #e6edf3; --muted: #8b949e; --panel: #161b22; --grid: #30363d;
    --embed: #172554; --attn: #083344; --mamba: #052e16; --linattn: #500724;
    --recur: #2e1065; --moe: #431407; --mlp: #2e1065; --norm: #21262d; --head: #450a0a;
    --config: #1e293b; --rope: #422006; --layer: #161b22;
    --conv: #022c22; --act: #1a2e05; --pool: #082f49; --quant: #3b0764; --proj: #422006;
    --io: #1e293b; --soft: #3b0764; --add: #0d1117; --cell-off: #21262d;
    /* --vision and --audio previously had no dark value, so .c-vision and
       .c-audio boxes kept their light pastel fill on the dark canvas. They
       reuse the dark fills of the variables that share their light fill:
       --mamba (#dcfce7) and --quant / --soft (#fae8ff). */
    --vision: #052e16; --audio: #3b0764;
  }
}

/* Canvas and typography. */
.bg { fill: var(--bg); }
text { font-family: ui-sans-serif, -apple-system, "Segoe UI", Roboto, sans-serif; fill: var(--fg); }
.title { font-size: 22px; font-weight: 700; }
.subtitle { font-size: 13px; fill: var(--muted); }
.box-label { font-size: 14px; font-weight: 600; }
.box-label.sm { font-size: 12.5px; }
.box-sub { font-size: 11px; fill: var(--muted); }
.glyph { font-size: 18px; font-weight: 700; fill: var(--add-s); }
.badge { font-size: 12px; font-weight: 700; fill: var(--fg); }
.facts-k { font-size: 11.5px; fill: var(--muted); }
.facts-v { font-size: 11.5px; font-weight: 600; }
.legend-t { font-size: 11.5px; fill: var(--fg); }
.sec-h { font-size: 12px; font-weight: 700; }
.cell-idx { font-size: 9px; fill: #ffffff; font-weight: 600; }

.panel { fill: var(--panel); stroke: var(--grid); }

/* Component boxes.
   No rx is set here on purpose: rx is a CSS geometry property in SVG 2, and a
   stylesheet declaration outranks the rx presentation attribute. The former
   "rx: 9" therefore flattened the per-box radii (6, 11, 15, 16) in renderers
   that implement geometry properties, while other renderers honoured the
   attributes. The boxes carry their own rx attribute.
   NOTE(review): confirm no rect.b elsewhere in the file relies on a
   stylesheet default radius. */
rect.b { stroke-width: 1.6; }

/* Fill / stroke pairs per component kind. */
.c-embed { fill: var(--embed); stroke: var(--embed-s); }
.c-attn { fill: var(--attn); stroke: var(--attn-s); }
.c-mamba { fill: var(--mamba); stroke: var(--mamba-s); }
.c-linattn { fill: var(--linattn); stroke: var(--linattn-s); }
.c-recur { fill: var(--recur); stroke: var(--recur-s); }
.c-moe { fill: var(--moe); stroke: var(--moe-s); }
.c-mlp { fill: var(--mlp); stroke: var(--mlp-s); }
.c-norm { fill: var(--norm); stroke: var(--norm-s); }
.c-head { fill: var(--head); stroke: var(--head-s); }
.c-config { fill: var(--config); stroke: var(--config-s); }
.c-rope { fill: var(--rope); stroke: var(--rope-s); }
/* .c-proj used to be declared a second time, identically, further down. */
.c-proj { fill: var(--proj); stroke: var(--proj-s); }
.c-conv { fill: var(--conv); stroke: var(--conv-s); }
.c-act { fill: var(--act); stroke: var(--act-s); }
.c-pool { fill: var(--pool); stroke: var(--pool-s); }
.c-quant { fill: var(--quant); stroke: var(--quant-s); }
.c-layer { fill: var(--layer); stroke: var(--layer-s); }
.c-io { fill: var(--io); stroke: var(--io-s); }
.c-soft { fill: var(--soft); stroke: var(--soft-s); }
.c-add { fill: var(--add); stroke: var(--add-s); }
.c-vision { fill: var(--vision); stroke: var(--vision-s); }
.c-audio { fill: var(--audio); stroke: var(--audio-s); }
.c-sub { fill: var(--bg); stroke: var(--block-s); stroke-width: 1.2; }
/* Dashed, unfilled frame drawn around a group of boxes. */
.c-block { fill: none; stroke: var(--block-s); stroke-width: 1.6; stroke-dasharray: 7 5; }

/* Layer-type swatches: fill and stroke share one colour. */
.c-lt-full { fill: var(--lt-full); stroke: var(--lt-full); }
.c-lt-sliding { fill: var(--lt-sliding); stroke: var(--lt-sliding); }
.c-lt-chunked { fill: var(--lt-chunked); stroke: var(--lt-chunked); }
.c-lt-compressed { fill: var(--lt-compressed); stroke: var(--lt-compressed); }
.c-lt-heavy { fill: var(--lt-heavy); stroke: var(--lt-heavy); }
.c-lt-linear { fill: var(--lt-linear); stroke: var(--lt-linear); }
.c-lt-mamba { fill: var(--lt-mamba); stroke: var(--lt-mamba); }

/* Grid and attention-mask cells. */
.cell-on { fill: var(--cell-on); }
.cell-off { fill: var(--cell-off); }
.grid-frame { fill: none; stroke: var(--grid); stroke-width: 1; }
.mask-bg { fill: var(--cell-off); }
.mask-on { fill: #22c55e; }
.mask-div { stroke: var(--fg); stroke-width: 1.5; stroke-dasharray: 3 2; }

/* Translucent strip behind a section heading. */
.sec-hbar { fill: var(--bg); opacity: 0.82; }

/* Declared ahead of .residual and .xattn; the two-class selector wins on
   specificity regardless of order. */
.residual.xattn { stroke: var(--xattn); stroke-width: 2.4; }

/* Faded, dashed rendering applied to whole groups. */
.ghost { opacity: 0.32; stroke-dasharray: 4 3; }

/* Diff highlights: a heavier stroke in the change colour. */
.ch-added rect.b, rect.b.ch-added { stroke: var(--added); stroke-width: 3.2; }
.ch-over rect.b, rect.b.ch-over { stroke: var(--over); stroke-width: 3.2; }
.ch-deleted rect.b, rect.b.ch-deleted { stroke: var(--deleted); stroke-width: 3.2; }

/* Connectors. */
.edge { stroke: var(--grid); stroke-width: 2; }
.flow { stroke: var(--grid); stroke-width: 2; fill: none; }
.residual { stroke: var(--residual); stroke-width: 2; fill: none; }
.rope { stroke: var(--rope-s); stroke-width: 2.2; fill: none; }
.xattn { stroke: var(--xattn); stroke-width: 2.4; fill: none; }

/* Fixed colours of the example-image thumbnail. */
.sky { fill: #bae6fd; }
.sun { fill: #fde047; }
.hill { fill: #4ade80; }
</style>
<defs>
  <!-- Arrowhead markers, one per connector colour. All four share the same
       triangle geometry and differ only in id and fill. -->
  <marker id="ah-flow" viewBox="0 0 10 10" refX="8" refY="5" markerWidth="7" markerHeight="7" orient="auto-start-reverse">
    <path d="M0,0 L10,5 L0,10 z" fill="var(--grid)"/>
  </marker>
  <marker id="ah-residual" viewBox="0 0 10 10" refX="8" refY="5" markerWidth="7" markerHeight="7" orient="auto-start-reverse">
    <path d="M0,0 L10,5 L0,10 z" fill="var(--residual)"/>
  </marker>
  <marker id="ah-rope" viewBox="0 0 10 10" refX="8" refY="5" markerWidth="7" markerHeight="7" orient="auto-start-reverse">
    <path d="M0,0 L10,5 L0,10 z" fill="var(--rope-s)"/>
  </marker>
  <marker id="ah-xattn" viewBox="0 0 10 10" refX="8" refY="5" markerWidth="7" markerHeight="7" orient="auto-start-reverse">
    <path d="M0,0 L10,5 L0,10 z" fill="var(--xattn)"/>
  </marker>
</defs>
<!-- Background rectangle covering the whole 1500 x 1113 viewBox. -->
<rect class="bg" x="0" y="0" width="1500" height="1113"/>
<!-- Diagram heading and, beneath it, the one-line diff summary. -->
<text class="title" x="24" y="34">aimv2</text>
<text class="subtitle" x="24" y="54">diff vs siglip (+clip,llama) · 3 overridden · 62 added · 0 deleted · 9 new · 5 inherited-as-is</text>
<!-- Vertical line at x=785, the same x used as the middle anchor of the centre-column labels; it runs from y=108 to y=1071 and is drawn before the boxes that follow. -->
<line class="edge" x1="785" y1="108" x2="785" y2="1071"/>
<!-- Dashed frames of the centre column. Each frame puts its caption 18px below its top edge; the inner frame also carries a repeat-count badge right-aligned at x=964. -->
<g class=""><rect class="c-block" x="586" y="184" width="398" height="753" rx="14"/><text class="box-sub" x="598" y="202">Aimv2Model · base model</text></g>
<g class=""><title>12 × Aimv2EncoderLayer</title><rect class="c-block" x="594" y="257" width="382" height="640" rx="14"/><text class="box-sub" x="606" y="275">Aimv2EncoderLayer</text><text class="badge" x="964" y="277" text-anchor="end">× 12</text></g>
<!-- Frame for the image/text attention mask in the left column. It was 16px tall (y=766 to 782), which collapsed the 14px corner radius and left the caption at y=784 below the frame, on top of the mask caption drawn at (83,782). It now spans y=748 to 952 so that it encloses the 156px mask grid at y=788 to 944, with the caption 18px below the top edge like the other dashed frames. -->
<g class=""><rect class="c-block" x="32" y="748" width="258" height="204" rx="14"/><text class="box-sub" x="44" y="766">image ⊕ text attention</text></g>
<g class=""><rect class="b c-vision ch-over" x="36" y="252" width="250" height="414" rx="11"/><rect class="sec-hbar" x="41" y="256" width="240" height="18" rx="5"/><text class="sec-h" x="48" y="269">Vision encoder · Aimv2VisionM…</text><text class="badge" x="274" y="269" text-anchor="end">×24</text></g>
<g class=""><rect class="b c-attn ch-over" x="620" y="321" width="330" height="280" rx="11"/><rect class="sec-hbar" x="625" y="325" width="320" height="18" rx="5"/><text class="sec-h" x="632" y="338">Self-Attention · Aimv2Attention</text></g>
<g class=""><rect class="b c-vision ch-over" x="46" y="278" width="230" height="382" rx="11"/><rect class="sec-hbar" x="51" y="282" width="220" height="18" rx="5"/><text class="sec-h" x="58" y="295">Aimv2EncoderLayer</text><text class="badge" x="264" y="295" text-anchor="end">×24</text></g>
<g class="ghost"><rect class="b c-mlp " x="620" y="697" width="330" height="152" rx="11"/><rect class="sec-hbar" x="625" y="701" width="320" height="18" rx="5"/><text class="sec-h" x="632" y="714">MLP · SiglipMLP ↩ inherited</text></g>
<g class=""><rect class="b c-sub ch-over" x="54" y="300" width="214" height="143" rx="11"/><rect class="sec-hbar" x="59" y="304" width="204" height="18" rx="5"/><text class="sec-h" x="66" y="317">attention · Aimv2Attention</text></g>
<g class="ghost"><rect class="b c-sub " x="54" y="449" width="214" height="143" rx="11"/><rect class="sec-hbar" x="59" y="453" width="204" height="18" rx="5"/><text class="sec-h" x="66" y="466">ffn · SiglipMLP ↩ inherited</text></g>
<!-- Dashed skip connections routed left of the centre column. The first runs from y=279 (top of the pre-attention norm box) to y=628, the second from y=628 to y=876; 628 and 876 are the centre heights of the two "+" circles. -->
<polyline class="residual" points="620,279 586,279 586,628 620,628" stroke-dasharray="6 4" marker-end="url(#ah-residual)"/>
<polyline class="residual" points="620,628 570,628 570,876 620,876" stroke-dasharray="6 4" marker-end="url(#ah-residual)"/>
<!-- Left-column flow at x=161: example image to pixel_values, pixel_values to the vision encoder box, vision encoder box to the projector. -->
<polyline class="flow" points="161,184 161,206" marker-end="url(#ah-flow)"/>
<polyline class="flow" points="161,236 161,252" marker-end="url(#ah-flow)"/>
<polyline class="flow" points="161,666 161,682" marker-end="url(#ah-flow)"/>
<!-- From the projector's right edge (286,708) across to the left edge of the Token Embedding box at (620,220). -->
<polyline class="flow" points="286,708 453,708 453,220 620,220" marker-end="url(#ah-flow)"/>
<g class=""><rect class="b c-io" x="620" y="108" width="330" height="32" rx="16"/><text class="box-label sm" x="785.0" y="122.0" text-anchor="middle">input_ids</text><text class="box-sub" x="785.0" y="137.0" text-anchor="middle">tokenize(&quot;Hey, how are you?&quot;) → [1, 6]</text></g>
<!-- Token chips for the example prompt. The visible label shows only the token text: cutting the raw token to four characters used to slice the trailing </w> suffix (presumably the tokenizer's end-of-word marker) and left a stray "<" on some chips while others showed the whole suffix. The raw token is still available in each chip's title element. -->
<g class="ghost"><title>token 0: &#x27;hey&lt;/w&gt;&#x27;</title><rect class="b c-sub" x="620" y="154" width="50" height="26" rx="9"/><text class="box-label sm" x="645.0" y="165.0" text-anchor="middle">hey</text><text class="box-sub" x="645.0" y="180.0" text-anchor="middle">0</text></g>
<g class="ghost"><title>token 1: &#x27;,&lt;/w&gt;&#x27;</title><rect class="b c-sub" x="675" y="154" width="50" height="26" rx="9"/><text class="box-label sm" x="700.0" y="165.0" text-anchor="middle">,</text><text class="box-sub" x="700.0" y="180.0" text-anchor="middle">1</text></g>
<g class="ghost"><title>token 2: &#x27;how&lt;/w&gt;&#x27;</title><rect class="b c-sub" x="731" y="154" width="50" height="26" rx="9"/><text class="box-label sm" x="756.0" y="165.0" text-anchor="middle">how</text><text class="box-sub" x="756.0" y="180.0" text-anchor="middle">2</text></g>
<g class="ghost"><title>token 3: &#x27;are&lt;/w&gt;&#x27;</title><rect class="b c-sub" x="787" y="154" width="50" height="26" rx="9"/><text class="box-label sm" x="812.0" y="165.0" text-anchor="middle">are</text><text class="box-sub" x="812.0" y="180.0" text-anchor="middle">3</text></g>
<g class="ghost"><title>token 4: &#x27;you&lt;/w&gt;&#x27;</title><rect class="b c-sub" x="843" y="154" width="50" height="26" rx="9"/><text class="box-label sm" x="868.0" y="165.0" text-anchor="middle">you</text><text class="box-sub" x="868.0" y="180.0" text-anchor="middle">4</text></g>
<g class="ghost"><title>token 5: &#x27;?&lt;/w&gt;&#x27;</title><rect class="b c-sub" x="899" y="154" width="50" height="26" rx="9"/><text class="box-label sm" x="924.0" y="165.0" text-anchor="middle">?</text><text class="box-sub" x="924.0" y="180.0" text-anchor="middle">5</text></g>
<g class=""><rect class="b c-embed ch-over" x="620" y="198" width="330" height="45" rx="9"/><text class="box-label sm" x="785.0" y="218.5" text-anchor="middle">Token Embedding</text><text class="box-sub" x="785.0" y="233.5" text-anchor="middle">weight [49408 × 768] → [1, 6, 768]</text><circle cx="632" cy="210" r="5" fill="var(--over)"/></g>
<g class="ghost"><rect class="b c-norm" x="620" y="279" width="330" height="28" rx="9"/><text class="box-label sm" x="785.0" y="291.0" text-anchor="middle">Aimv2RMSNorm</text><text class="box-sub" x="785.0" y="306.0" text-anchor="middle">pre-attention [1, 6, 768]</text></g>
<g class="ghost"><title>k_proj: Linear [768→768]</title><rect class="b c-proj" x="630" y="347" width="151" height="22" rx="9"/><text class="box-label sm" x="705.5" y="362.5" text-anchor="middle">k_proj [768→768]</text></g>
<g class="ghost"><title>v_proj: Linear [768→768]</title><rect class="b c-proj" x="789" y="347" width="151" height="22" rx="9"/><text class="box-label sm" x="864.5" y="362.5" text-anchor="middle">v_proj [768→768]</text></g>
<g class="ghost"><title>q_proj: Linear [768→768]</title><rect class="b c-proj" x="630" y="377" width="151" height="22" rx="9"/><text class="box-label sm" x="705.5" y="392.5" text-anchor="middle">q_proj [768→768]</text></g>
<g class="ghost"><title>out_proj: Linear [768→768]</title><rect class="b c-proj" x="789" y="377" width="151" height="22" rx="9"/><text class="box-label sm" x="864.5" y="392.5" text-anchor="middle">out_proj [768→768]</text></g>
<g class="ghost"><rect class="b c-sub" x="630" y="407" width="310" height="30" rx="9"/><text class="box-label sm" x="785.0" y="420.0" text-anchor="middle">scaled dot-product attention</text><text class="box-sub" x="785.0" y="435.0" text-anchor="middle">6 heads · head_dim 128</text></g>
<g class="">
  <title>causal attention mask (q↓ × k→)</title>
  <text class="box-sub" x="715" y="449">causal mask</text>
  <rect class="mask-bg" x="715" y="455" width="140.0" height="140.0"/>
  <!-- Lower-triangular staircase drawn as one shape. Row r (0-based, 14 rows of 10px) spans r+1 cells from x=715, which is exactly the area of the 105 separate 10x10 cells it replaces. A single path avoids hairline seams between abutting cells at fractional zoom and is far smaller. -->
  <path class="mask-on" d="M715,455
    h10 v10 h10 v10 h10 v10 h10 v10 h10 v10 h10 v10 h10 v10
    h10 v10 h10 v10 h10 v10 h10 v10 h10 v10 h10 v10 h10 v10
    H715 z"/>
  <rect class="grid-frame" x="715" y="455" width="140.0" height="140.0"/>
</g>
<g class=""><title>residual add</title><circle class="b c-add " cx="785.0" cy="628" r="13"/><text class="glyph" x="785.0" y="634" text-anchor="middle">+</text></g>
<g class="ghost"><rect class="b c-norm" x="620" y="655" width="330" height="28" rx="9"/><text class="box-label sm" x="785.0" y="667.0" text-anchor="middle">Aimv2RMSNorm</text><text class="box-sub" x="785.0" y="682.0" text-anchor="middle">pre-FFN [1, 6, 768]</text></g>
<g class=""><title>gate_proj: Linear [768→2048]</title><rect class="b c-proj ch-added" x="632" y="723" width="306" height="22" rx="9"/><text class="box-label sm" x="785.0" y="738.5" text-anchor="middle">gate_proj [768→2048]</text><circle cx="644" cy="735" r="5" fill="var(--added)"/></g>
<g class=""><title>up_proj: Linear [768→2048]</title><rect class="b c-proj ch-added" x="632" y="753" width="306" height="22" rx="9"/><text class="box-label sm" x="785.0" y="768.5" text-anchor="middle">up_proj [768→2048]</text><circle cx="644" cy="765" r="5" fill="var(--added)"/></g>
<g class=""><title>down_proj: Linear [2048→768]</title><rect class="b c-proj ch-added" x="632" y="783" width="306" height="22" rx="9"/><text class="box-label sm" x="785.0" y="798.5" text-anchor="middle">down_proj [2048→768]</text><circle cx="644" cy="795" r="5" fill="var(--added)"/></g>
<g class=""><title>act_fn: SiLUActivation</title><rect class="b c-act ch-added" x="632" y="813" width="306" height="22" rx="9"/><text class="box-label sm" x="785.0" y="828.5" text-anchor="middle">act_fn SiLU</text><circle cx="644" cy="825" r="5" fill="var(--added)"/></g>
<g class=""><title>residual add</title><circle class="b c-add " cx="785.0" cy="876" r="13"/><text class="glyph" x="785.0" y="882" text-anchor="middle">+</text></g>
<g class="ghost"><rect class="b c-norm" x="620" y="903" width="330" height="28" rx="9"/><text class="box-label sm" x="785.0" y="915.0" text-anchor="middle">Final RMSNorm</text><text class="box-sub" x="785.0" y="930.0" text-anchor="middle">[1, 6, 768]</text></g>
<g class=""><rect class="b c-head ch-over" x="620" y="945" width="330" height="42" rx="9"/><text class="box-label sm" x="785.0" y="964.0" text-anchor="middle">LM Head</text><text class="box-sub" x="785.0" y="979.0" text-anchor="middle">Linear [768→49408]</text><circle cx="632" cy="957" r="5" fill="var(--over)"/></g>
<g class="ghost"><rect class="b c-soft" x="620" y="1001" width="330" height="26" rx="9"/><text class="box-label sm" x="785.0" y="1018.5" text-anchor="middle">Softmax</text></g>
<g class=""><rect class="b c-io" x="620" y="1041" width="330" height="30" rx="15"/><text class="box-label sm" x="785.0" y="1054.0" text-anchor="middle">logits</text><text class="box-sub" x="785.0" y="1069.0" text-anchor="middle">[1, 6, 49408]</text></g>
<g class="ghost"><rect class="b c-vision" x="123" y="120" width="76" height="64" rx="6"/><clipPath id="ic123120"><rect x="129" y="126" width="64" height="52" rx="3"/></clipPath><g clip-path="url(#ic123120)"><rect class="sky" x="129" y="126" width="64" height="52"/><circle class="sun" cx="175" cy="143" r="8"/><path class="hill" d="M129,178 L151,152 L167,178 Z"/><path class="hill" d="M158,178 L177,155 L193,178 Z"/></g><rect class="grid-frame" x="129" y="126" width="64" height="52" rx="3"/><text class="box-sub" x="161.0" y="196" text-anchor="middle">example image</text></g>
<g class=""><rect class="b c-io" x="36" y="206" width="250" height="30" rx="15"/><text class="box-label sm" x="161.0" y="219.0" text-anchor="middle">pixel_values</text><text class="box-sub" x="161.0" y="234.0" text-anchor="middle">[1, 3, 336, 336]</text></g>
<g class="ghost"><title>k_proj: Linear [1024→1024]</title><rect class="b c-proj" x="62" y="325" width="198" height="22" rx="9"/><text class="box-label sm" x="161.0" y="340.5" text-anchor="middle">k_proj [1024→1024]</text></g>
<g class="ghost"><title>v_proj: Linear [1024→1024]</title><rect class="b c-proj" x="62" y="353" width="198" height="22" rx="9"/><text class="box-label sm" x="161.0" y="368.5" text-anchor="middle">v_proj [1024→1024]</text></g>
<g class="ghost"><title>q_proj: Linear [1024→1024]</title><rect class="b c-proj" x="62" y="381" width="198" height="22" rx="9"/><text class="box-label sm" x="161.0" y="396.5" text-anchor="middle">q_proj [1024→1024]</text></g>
<g class="ghost"><title>out_proj: Linear [1024→1024]</title><rect class="b c-proj" x="62" y="409" width="198" height="22" rx="9"/><text class="box-label sm" x="161.0" y="424.5" text-anchor="middle">out_proj [1024→1024]</text></g>
<g class=""><title>gate_proj: Linear [1024→2816]</title><rect class="b c-proj ch-added" x="62" y="474" width="198" height="22" rx="9"/><text class="box-label sm" x="161.0" y="489.5" text-anchor="middle">gate_proj [1024→2816]</text><circle cx="74" cy="486" r="5" fill="var(--added)"/></g>
<g class=""><title>up_proj: Linear [1024→2816]</title><rect class="b c-proj ch-added" x="62" y="502" width="198" height="22" rx="9"/><text class="box-label sm" x="161.0" y="517.5" text-anchor="middle">up_proj [1024→2816]</text><circle cx="74" cy="514" r="5" fill="var(--added)"/></g>
<g class=""><title>down_proj: Linear [2816→1024]</title><rect class="b c-proj ch-added" x="62" y="530" width="198" height="22" rx="9"/><text class="box-label sm" x="161.0" y="545.5" text-anchor="middle">down_proj [2816→1024]</text><circle cx="74" cy="542" r="5" fill="var(--added)"/></g>
<g class=""><title>act_fn: SiLUActivation</title><rect class="b c-act ch-added" x="62" y="558" width="198" height="22" rx="9"/><text class="box-label sm" x="161.0" y="573.5" text-anchor="middle">act_fn SiLU</text><circle cx="74" cy="570" r="5" fill="var(--added)"/></g>
<g class=""><title>rms_norm1: Aimv2RMSNorm 1024</title><rect class="b c-norm ch-added" x="54" y="598" width="214" height="22" rx="9"/><text class="box-label sm" x="161.0" y="613.5" text-anchor="middle">rms_norm1 RMSNorm 1024</text><circle cx="66" cy="610" r="5" fill="var(--added)"/></g>
<g class=""><title>rms_norm2: Aimv2RMSNorm 1024</title><rect class="b c-norm ch-added" x="54" y="626" width="214" height="22" rx="9"/><text class="box-label sm" x="161.0" y="641.5" text-anchor="middle">rms_norm2 RMSNorm 1024</text><circle cx="66" cy="638" r="5" fill="var(--added)"/></g>
<g class="ghost"><rect class="b c-proj" x="36" y="682" width="250" height="52" rx="9"/><text class="box-label sm" x="161.0" y="706.0" text-anchor="middle">Projector · Linear</text><text class="box-sub" x="161.0" y="721.0" text-anchor="middle">align features → text hidden dim</text></g>
<g class="ghost"><rect class="b c-proj" x="375" y="190" width="168" height="18" rx="9"/><text class="box-label sm" x="459.0" y="203.5" text-anchor="middle">merge at modality tokens</text></g>
<g class="">
  <title>image↓text query × image|text key</title>
  <text class="box-sub" x="83" y="782">5 img + 7 text · fully causal</text>
  <rect class="mask-bg" x="83" y="788" width="156.0" height="156.0"/>
  <!-- Lower-triangular staircase drawn as one shape. Row r (0-based, 12 rows of 13px) spans r+1 cells from x=83, which is exactly the area of the 78 separate 13x13 cells it replaces. -->
  <path class="mask-on" d="M83,788
    h13 v13 h13 v13 h13 v13 h13 v13 h13 v13 h13 v13
    h13 v13 h13 v13 h13 v13 h13 v13 h13 v13 h13 v13
    H83 z"/>
  <!-- Dashed divider after the fifth column (83 + 5 x 13 = 148), matching the "5 img + 7 text" caption. -->
  <line class="mask-div" x1="148.0" y1="788" x2="148.0" y2="944.0"/>
  <rect class="grid-frame" x="83" y="788" width="156.0" height="156.0"/>
</g>
<!-- Facts panel: rounded background card followed by seven key/value rows.
     Keys are left-aligned at x=1226; values are right-aligned at x=1462 via
     text-anchor="end". Row baselines run from y=112 to y=244 in steps of 22. -->
<rect class="panel" x="1212" y="88" width="264" height="174" rx="8"/>
<text class="facts-k" x="1226" y="112">model id</text>
<!-- The id is shortened to fit the card; the trailing ellipsis marks the cut,
     matching the truncated summaries in the per-class change list. -->
<text class="facts-v" x="1462" y="112" text-anchor="end">apple/aimv2-large-patch14-224…</text>
<text class="facts-k" x="1226" y="134">parent</text>
<text class="facts-v" x="1462" y="134" text-anchor="end">siglip</text>
<text class="facts-k" x="1226" y="156">classes</text>
<text class="facts-v" x="1462" y="156" text-anchor="end">16</text>
<text class="facts-k" x="1226" y="178">overridden</text>
<text class="facts-v" x="1462" y="178" text-anchor="end">3</text>
<text class="facts-k" x="1226" y="200">added</text>
<text class="facts-v" x="1462" y="200" text-anchor="end">62</text>
<text class="facts-k" x="1226" y="222">new classes</text>
<text class="facts-v" x="1462" y="222" text-anchor="end">9</text>
<text class="facts-k" x="1226" y="244">inherited as-is</text>
<text class="facts-v" x="1462" y="244" text-anchor="end">5</text>
<!-- Legend: a bold heading followed by four rows. Each row is a 16x12 outline
     swatch (no fill, 3px stroke) at x=1216 and a label at x=1240; rows are
     18 units apart. Stroke colors come from the custom properties declared
     in the style block at the top of the file. -->
<text class="legend-t" x="1214" y="282" font-weight="700">legend</text>
<!-- Swatch stroked with the "added" color. -->
<rect x="1216" y="293" width="16" height="12" rx="2" fill="none" stroke="var(--added)" stroke-width="3"/>
<text class="legend-t" x="1240" y="303">new submodule (vs parent)</text>
<!-- Swatch stroked with the "over" color. -->
<rect x="1216" y="311" width="16" height="12" rx="2" fill="none" stroke="var(--over)" stroke-width="3"/>
<text class="legend-t" x="1240" y="321">changed / redefined</text>
<!-- Swatch stroked with the "deleted" color. -->
<rect x="1216" y="329" width="16" height="12" rx="2" fill="none" stroke="var(--deleted)" stroke-width="3"/>
<text class="legend-t" x="1240" y="339">deleted</text>
<!-- Swatch stroked with the neutral "grid" color. -->
<rect x="1216" y="347" width="16" height="12" rx="2" fill="none" stroke="var(--grid)" stroke-width="3"/>
<text class="legend-t" x="1240" y="357">inherited / copy-pasted</text>
<!-- "Changes by class" list: a bold heading followed by one entry per class.
     Each entry is three elements: a radius-4 marker dot at cx=1220, the class
     name at x=1232, and a one-line summary 13 units below the name. Entries
     are 28 units apart.
     NOTE(review): "ovr" and "add" in the summaries presumably abbreviate
     overridden and added members, and a trailing ellipsis presumably marks
     text the generator cut to fit; confirm against the renderer. -->
<text class="legend-t" x="1214" y="380" font-weight="700">changes by class</text>
<!-- The only entry whose marker uses the "over" color; all others below use "added". -->
<circle cx="1220" cy="393" r="4" fill="var(--over)"/>
<text class="facts-v" x="1232" y="396">Aimv2Attention</text>
<text class="box-sub" x="1232" y="409">ovr __init__</text>
<circle cx="1220" cy="421" r="4" fill="var(--added)"/>
<text class="facts-v" x="1232" y="424">Aimv2AttentionPoolingHead</text>
<text class="box-sub" x="1232" y="437">add __init__,forward</text>
<circle cx="1220" cy="449" r="4" fill="var(--added)"/>
<text class="facts-v" x="1232" y="452">Aimv2Config</text>
<text class="box-sub" x="1232" y="465">3 attr</text>
<circle cx="1220" cy="477" r="4" fill="var(--added)"/>
<text class="facts-v" x="1232" y="480">Aimv2EncoderLayer</text>
<text class="box-sub" x="1232" y="493">add __init__,forward</text>
<!-- NOTE(review): this summary starts with "ovr" like Aimv2Attention, yet the
     marker uses the "added" color; confirm this is the intended rule. -->
<circle cx="1220" cy="505" r="4" fill="var(--added)"/>
<text class="facts-v" x="1232" y="508">Aimv2Model</text>
<text class="box-sub" x="1232" y="521">ovr __init__,forward; 1 attr</text>
<circle cx="1220" cy="533" r="4" fill="var(--added)"/>
<text class="facts-v" x="1232" y="536">Aimv2PreTrainedModel</text>
<text class="box-sub" x="1232" y="549">add _init_weights; 8 attr</text>
<circle cx="1220" cy="561" r="4" fill="var(--added)"/>
<text class="facts-v" x="1232" y="564">Aimv2TextConfig</text>
<text class="box-sub" x="1232" y="577">add __post_init__; 15 attr</text>
<circle cx="1220" cy="589" r="4" fill="var(--added)"/>
<text class="facts-v" x="1232" y="592">Aimv2TextModel</text>
<text class="box-sub" x="1232" y="605">add __init__,forward,get_input_embeddings,se…</text>
<circle cx="1220" cy="617" r="4" fill="var(--added)"/>
<text class="facts-v" x="1232" y="620">Aimv2VisionConfig</text>
<text class="box-sub" x="1232" y="633">14 attr</text>
<circle cx="1220" cy="645" r="4" fill="var(--added)"/>
<text class="facts-v" x="1232" y="648">Aimv2VisionEmbeddings</text>
<text class="box-sub" x="1232" y="661">add __init__,forward</text>
<circle cx="1220" cy="673" r="4" fill="var(--added)"/>
<text class="facts-v" x="1232" y="676">Aimv2VisionModel</text>
<text class="box-sub" x="1232" y="689">add __init__,forward,get_input_embeddings; 3…</text>
</svg>