<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 1200 1076" width="1200" height="1076" font-size="14">
<style>
/* Light palette. Most component families declare a pastel fill (--name) and a
   saturated outline (--name-s). */
:root {
  /* page chrome */
  --bg: #ffffff; --fg: #1b1f24; --muted: #6b7280; --panel: #f6f8fa; --grid: #e5e7eb;
  /* component families */
  --embed: #dbeafe; --embed-s: #3b82f6;
  --attn: #cffafe; --attn-s: #06b6d4;
  --mamba: #dcfce7; --mamba-s: #22c55e;
  --linattn: #fce7f3; --linattn-s: #ec4899;
  --recur: #ede9fe; --recur-s: #8b5cf6;
  --moe: #ffedd5; --moe-s: #f97316;
  --mlp: #ede9fe; --mlp-s: #8b5cf6;
  --norm: #e5e7eb; --norm-s: #9ca3af;
  --head: #fee2e2; --head-s: #ef4444;
  --config: #f1f5f9; --config-s: #64748b;
  --rope: #fef9c3; --rope-s: #eab308;
  --layer: #f8fafc; --layer-s: #cbd5e1;
  --io: #f1f5f9; --io-s: #94a3b8;
  --soft: #fae8ff; --soft-s: #c026d3;
  --add: #ffffff; --add-s: #475569;
  --block-s: #94a3b8;
  /* connectors and diff markers */
  --residual: #f59e0b;
  --added: #16a34a; --over: #d97706; --deleted: #dc2626;
  /* layer-type swatches and grid cells */
  --lt-full: #06b6d4; --lt-sliding: #3b82f6; --lt-chunked: #8b5cf6;
  --lt-compressed: #f97316; --lt-heavy: #dc2626; --lt-linear: #ec4899; --lt-mamba: #22c55e;
  --cell-on: #0ea5e9; --cell-off: #e5e7eb;
  /* modality and operator families */
  --vision: #dcfce7; --vision-s: #16a34a; --audio: #fae8ff; --audio-s: #c026d3;
  --proj: #fef3c7; --proj-s: #d97706; --xattn: #db2777;
  --conv: #d1fae5; --conv-s: #10b981; --act: #ecfccb; --act-s: #65a30d;
  --pool: #e0f2fe; --pool-s: #0284c7; --quant: #fae8ff; --quant-s: #c026d3;
}

/* Dark palette: only page chrome and fills are re-declared; the -s outline
   colours are shared with the light theme. */
@media (prefers-color-scheme: dark) {
  :root {
    --bg: #0d1117; --fg: #e6edf3; --muted: #8b949e; --panel: #161b22; --grid: #30363d;
    --embed: #172554; --attn: #083344; --mamba: #052e16; --linattn: #500724;
    --recur: #2e1065; --moe: #431407; --mlp: #2e1065; --norm: #21262d; --head: #450a0a;
    --config: #1e293b; --rope: #422006; --layer: #161b22;
    --conv: #022c22; --act: #1a2e05; --pool: #082f49; --quant: #3b0764; --proj: #422006;
    --io: #1e293b; --soft: #3b0764; --add: #0d1117; --cell-off: #21262d;
    /* vision and audio share the mamba and soft pastels in the light theme,
       so they take the same dark fills here instead of staying pastel. */
    --vision: #052e16; --audio: #3b0764;
  }
}

/* Typography */
.bg { fill: var(--bg); }
text { font-family: ui-sans-serif, -apple-system, "Segoe UI", Roboto, sans-serif; fill: var(--fg); }
.title { font-size: 22px; font-weight: 700; }
.subtitle { font-size: 13px; fill: var(--muted); }
.box-label { font-size: 14px; font-weight: 600; }
.box-label.sm { font-size: 12.5px; }
.box-sub { font-size: 11px; fill: var(--muted); }
.glyph { font-size: 18px; font-weight: 700; fill: var(--add-s); }
.badge { font-size: 12px; font-weight: 700; fill: var(--fg); }
.facts-k { font-size: 11.5px; fill: var(--muted); }
.facts-v { font-size: 11.5px; font-weight: 600; }
.legend-t { font-size: 11.5px; fill: var(--fg); }
.panel { fill: var(--panel); stroke: var(--grid); }

/* Component boxes. The corner radius is taken from each rect's own rx
   attribute: a CSS rx declaration here would override those attributes in
   renderers that treat rx as a style property. */
rect.b { stroke-width: 1.6; }
.c-embed { fill: var(--embed); stroke: var(--embed-s); }
.c-attn { fill: var(--attn); stroke: var(--attn-s); }
.c-mamba { fill: var(--mamba); stroke: var(--mamba-s); }
.c-linattn { fill: var(--linattn); stroke: var(--linattn-s); }
.c-recur { fill: var(--recur); stroke: var(--recur-s); }
.c-moe { fill: var(--moe); stroke: var(--moe-s); }
.c-mlp { fill: var(--mlp); stroke: var(--mlp-s); }
.c-norm { fill: var(--norm); stroke: var(--norm-s); }
.c-head { fill: var(--head); stroke: var(--head-s); }
.c-config { fill: var(--config); stroke: var(--config-s); }
.c-rope { fill: var(--rope); stroke: var(--rope-s); }
.c-proj { fill: var(--proj); stroke: var(--proj-s); }
.c-conv { fill: var(--conv); stroke: var(--conv-s); }
.c-act { fill: var(--act); stroke: var(--act-s); }
.c-pool { fill: var(--pool); stroke: var(--pool-s); }
.c-quant { fill: var(--quant); stroke: var(--quant-s); }
.c-layer { fill: var(--layer); stroke: var(--layer-s); }
.c-io { fill: var(--io); stroke: var(--io-s); }
.c-soft { fill: var(--soft); stroke: var(--soft-s); }
.c-add { fill: var(--add); stroke: var(--add-s); }
.c-block { fill: none; stroke: var(--block-s); stroke-width: 1.6; stroke-dasharray: 7 5; }

/* Layer-type swatches: fill and outline use the same colour. */
.c-lt-full { fill: var(--lt-full); stroke: var(--lt-full); }
.c-lt-sliding { fill: var(--lt-sliding); stroke: var(--lt-sliding); }
.c-lt-chunked { fill: var(--lt-chunked); stroke: var(--lt-chunked); }
.c-lt-compressed { fill: var(--lt-compressed); stroke: var(--lt-compressed); }
.c-lt-heavy { fill: var(--lt-heavy); stroke: var(--lt-heavy); }
.c-lt-linear { fill: var(--lt-linear); stroke: var(--lt-linear); }
.c-lt-mamba { fill: var(--lt-mamba); stroke: var(--lt-mamba); }

/* Grids and attention masks */
.cell-on { fill: var(--cell-on); }
.cell-off { fill: var(--cell-off); }
.grid-frame { fill: none; stroke: var(--grid); stroke-width: 1; }
.mask-bg { fill: var(--cell-off); }
.mask-on { fill: #22c55e; }
.mask-div { stroke: var(--fg); stroke-width: 1.5; stroke-dasharray: 3 2; }

/* Modality boxes, nested sub-boxes and section headers */
.c-vision { fill: var(--vision); stroke: var(--vision-s); }
.c-audio { fill: var(--audio); stroke: var(--audio-s); }
.c-sub { fill: var(--bg); stroke: var(--block-s); stroke-width: 1.2; }
.sec-h { font-size: 12px; font-weight: 700; }
.sec-hbar { fill: var(--bg); opacity: 0.82; }
.residual.xattn { stroke: var(--xattn); stroke-width: 2.4; }

/* Diff state: ghost marks inherited parts; ch-* thickens and recolours the outline. */
.ghost { opacity: 0.32; stroke-dasharray: 4 3; }
.ch-added rect.b, rect.b.ch-added { stroke: var(--added); stroke-width: 3.2; }
.ch-over rect.b, rect.b.ch-over { stroke: var(--over); stroke-width: 3.2; }
.ch-deleted rect.b, rect.b.ch-deleted { stroke: var(--deleted); stroke-width: 3.2; }

/* Connectors */
.edge { stroke: var(--grid); stroke-width: 2; }
.flow { stroke: var(--grid); stroke-width: 2; fill: none; }
.residual { stroke: var(--residual); stroke-width: 2; fill: none; }
.rope { stroke: var(--rope-s); stroke-width: 2.2; fill: none; }
.xattn { stroke: var(--xattn); stroke-width: 2.4; fill: none; }

/* Miscellaneous glyph fills */
.cell-idx { font-size: 9px; fill: #ffffff; font-weight: 600; }
.sky { fill: #bae6fd; } .sun { fill: #fde047; } .hill { fill: #4ade80; }
</style>
<!-- Arrowhead markers: the same triangle for every connector kind, differing only in fill colour. -->
<defs>
  <marker id="ah-flow" viewBox="0 0 10 10" refX="8" refY="5" markerWidth="7" markerHeight="7" orient="auto-start-reverse">
    <path d="M0,0 L10,5 L0,10 z" fill="var(--grid)"/>
  </marker>
  <marker id="ah-residual" viewBox="0 0 10 10" refX="8" refY="5" markerWidth="7" markerHeight="7" orient="auto-start-reverse">
    <path d="M0,0 L10,5 L0,10 z" fill="var(--residual)"/>
  </marker>
  <marker id="ah-rope" viewBox="0 0 10 10" refX="8" refY="5" markerWidth="7" markerHeight="7" orient="auto-start-reverse">
    <path d="M0,0 L10,5 L0,10 z" fill="var(--rope-s)"/>
  </marker>
  <marker id="ah-xattn" viewBox="0 0 10 10" refX="8" refY="5" markerWidth="7" markerHeight="7" orient="auto-start-reverse">
    <path d="M0,0 L10,5 L0,10 z" fill="var(--xattn)"/>
  </marker>
</defs>
<!-- Canvas background and diagram heading. The subtitle separators are middle dots (U+00B7). -->
<rect class="bg" x="0" y="0" width="1200" height="1076"/>
<text class="title" x="24" y="34">audio_spectrogram_transformer</text>
<text class="subtitle" x="24" y="54">diff vs vit · 7 overridden · 12 added · 0 deleted · 5 new · 3 inherited-as-is</text>
<!-- Vertical spine running through the centre of the component column. -->
<line class="edge" x1="485" y1="108" x2="485" y2="1034"/>
<!-- Dashed frames: base model, the repeated encoder layer, and the task head. -->
<g><rect class="c-block" x="286" y="140" width="398" height="708" rx="14"/><text class="box-sub" x="298" y="158">ASTModel · base model</text></g>
<g><title>12 × ASTLayer</title><rect class="c-block" x="294" y="228" width="382" height="580" rx="14"/><text class="box-sub" x="306" y="246">ASTLayer</text><text class="badge" x="664" y="248" text-anchor="end">× 12</text></g>
<g><rect class="c-block" x="286" y="848" width="398" height="194" rx="14"/><text class="box-sub" x="298" y="866">ASTForAudioClassification · task head</text></g>
<!-- Ghosted section containers for the parts inherited from the parent model. -->
<g class="ghost"><rect class="b c-attn" x="320" y="292" width="330" height="280" rx="11"/><rect class="sec-hbar" x="325" y="296" width="320" height="18" rx="5"/><text class="sec-h" x="332" y="309">Self-Attention · ViTAttention ↩ inherited</text></g>
<g class="ghost"><rect class="b c-mlp" x="320" y="668" width="330" height="92" rx="11"/><rect class="sec-hbar" x="325" y="672" width="320" height="18" rx="5"/><text class="sec-h" x="332" y="685">MLP · ViTMLP ↩ inherited</text></g>
<!-- Residual rails: dashed bypasses routed down the left side of the column. -->
<polyline class="residual" points="320,250 286,250 286,599 320,599" stroke-dasharray="6 4" marker-end="url(#ah-residual)"/>
<polyline class="residual" points="320,599 270,599 270,787 320,787" stroke-dasharray="6 4" marker-end="url(#ah-residual)"/>
<!-- Input pill and the overridden feature-projection box (orange dot marks the override). -->
<g><rect class="b c-io" x="320" y="108" width="330" height="32" rx="16"/><text class="box-label sm" x="485.0" y="122.0" text-anchor="middle">input audio</text><text class="box-sub" x="485.0" y="137.0" text-anchor="middle">input_features [1, n_mels, T]</text></g>
<g><rect class="b c-embed ch-over" x="320" y="154" width="330" height="60" rx="9"/><text class="box-label sm" x="485.0" y="174.5" text-anchor="middle">Feature Projection</text><text class="box-sub" x="485.0" y="189.5" text-anchor="middle">conv/linear → [1, 6, 768]</text><text class="box-sub" x="485.0" y="204.5" text-anchor="middle">+ position embeddings</text><circle cx="332" cy="166" r="5" fill="var(--over)"/></g>
<!-- Inherited (ghosted) pre-attention norm and the attention internals. -->
<g class="ghost"><rect class="b c-norm" x="320" y="250" width="330" height="28" rx="9"/><text class="box-label sm" x="485.0" y="262.0" text-anchor="middle">LayerNorm</text><text class="box-sub" x="485.0" y="277.0" text-anchor="middle">pre-attention [1, 6, 768]</text></g>
<g class="ghost"><title>q_proj: Linear [768→768]</title><rect class="b c-proj" x="330" y="318" width="151" height="22" rx="9"/><text class="box-label sm" x="405.5" y="333.5" text-anchor="middle">q_proj [768→768]</text></g>
<g class="ghost"><title>k_proj: Linear [768→768]</title><rect class="b c-proj" x="489" y="318" width="151" height="22" rx="9"/><text class="box-label sm" x="564.5" y="333.5" text-anchor="middle">k_proj [768→768]</text></g>
<g class="ghost"><title>v_proj: Linear [768→768]</title><rect class="b c-proj" x="330" y="348" width="151" height="22" rx="9"/><text class="box-label sm" x="405.5" y="363.5" text-anchor="middle">v_proj [768→768]</text></g>
<g class="ghost"><title>o_proj: Linear [768→768]</title><rect class="b c-proj" x="489" y="348" width="151" height="22" rx="9"/><text class="box-label sm" x="564.5" y="363.5" text-anchor="middle">o_proj [768→768]</text></g>
<g class="ghost"><rect class="b c-sub" x="330" y="378" width="310" height="30" rx="9"/><text class="box-label sm" x="485.0" y="391.0" text-anchor="middle">scaled dot-product attention</text><text class="box-sub" x="485.0" y="406.0" text-anchor="middle">12 heads · head_dim 64</text></g>
<!-- Bidirectional attention mask. All 14 x 14 cells are enabled, so the grid is
     drawn as one 140 x 140 square rather than 196 abutting 10 x 10 squares.
     NOTE(review): the two arrows in the title are reconstructed from
     mis-decoded text; confirm against the generator. -->
<g>
  <title>bidirectional attention mask (q↓ × k→)</title>
  <text class="box-sub" x="415" y="420">bidirectional mask</text>
  <rect class="mask-bg" x="415" y="426" width="140.0" height="140.0"/>
  <rect class="mask-on" x="415" y="426" width="140" height="140"/>
  <rect class="grid-frame" x="415" y="426" width="140.0" height="140.0"/>
</g>
<!-- Residual add after attention. -->
<g><title>residual add</title><circle class="b c-add" cx="485.0" cy="599" r="13"/><text class="glyph" x="485.0" y="605" text-anchor="middle">+</text></g>
<!-- Inherited (ghosted) pre-FFN norm and MLP internals. -->
<g class="ghost"><rect class="b c-norm" x="320" y="626" width="330" height="28" rx="9"/><text class="box-label sm" x="485.0" y="638.0" text-anchor="middle">LayerNorm</text><text class="box-sub" x="485.0" y="653.0" text-anchor="middle">pre-FFN [1, 6, 768]</text></g>
<g class="ghost"><title>activation_fn: GELUActivation</title><rect class="b c-act" x="332" y="694" width="149" height="22" rx="9"/><text class="box-label sm" x="406.5" y="709.5" text-anchor="middle">activation_fn GELU</text></g>
<g class="ghost"><title>fc1: Linear [768→3072]</title><rect class="b c-proj" x="489" y="694" width="149" height="22" rx="9"/><text class="box-label sm" x="563.5" y="709.5" text-anchor="middle">fc1 [768→3072]</text></g>
<g class="ghost"><title>fc2: Linear [3072→768]</title><rect class="b c-proj" x="332" y="724" width="306" height="22" rx="9"/><text class="box-label sm" x="485.0" y="739.5" text-anchor="middle">fc2 [3072→768]</text></g>
<!-- Residual add after the MLP, then the final norm of the base model. -->
<g><title>residual add</title><circle class="b c-add" cx="485.0" cy="787" r="13"/><text class="glyph" x="485.0" y="793" text-anchor="middle">+</text></g>
<g class="ghost"><rect class="b c-norm" x="320" y="814" width="330" height="28" rx="9"/><text class="box-label sm" x="485.0" y="826.0" text-anchor="middle">Final LayerNorm</text><text class="box-sub" x="485.0" y="841.0" text-anchor="middle">[1, 6, 768]</text></g>
<!-- Task head: overridden pooling and classifier (orange dots), ghosted softmax, output pill. -->
<g><title>ASTForAudioClassification</title><rect class="b c-head ch-over" x="320" y="856" width="330" height="40" rx="9"/><text class="box-label sm" x="485.0" y="874.0" text-anchor="middle">Pool (CLS / mean)</text><text class="box-sub" x="485.0" y="889.0" text-anchor="middle">[1, 6, 768] → [1, 768]</text><circle cx="332" cy="868" r="5" fill="var(--over)"/></g>
<g><title>ASTForAudioClassification</title><rect class="b c-head ch-over" x="320" y="910" width="330" height="40" rx="9"/><text class="box-label sm" x="485.0" y="928.0" text-anchor="middle">Classifier head</text><text class="box-sub" x="485.0" y="943.0" text-anchor="middle">Linear [768→2]</text><circle cx="332" cy="922" r="5" fill="var(--over)"/></g>
<g class="ghost"><rect class="b c-soft" x="320" y="964" width="330" height="26" rx="9"/><text class="box-label sm" x="485.0" y="981.5" text-anchor="middle">Softmax</text></g>
<g><rect class="b c-io" x="320" y="1004" width="330" height="30" rx="15"/><text class="box-label sm" x="485.0" y="1017.0" text-anchor="middle">class logits</text><text class="box-sub" x="485.0" y="1032.0" text-anchor="middle">[1, 2] (2 classes)</text></g>
<!-- Facts panel: keys start at x=926, values are end-anchored at x=1162, rows are 22px apart. -->
<g>
  <rect class="panel" x="912" y="88" width="264" height="174" rx="8"/>
  <text class="facts-k" x="926" y="112">model id</text>
  <text class="facts-v" x="1162" y="112" text-anchor="end">MIT/ast-finetuned-audioset-10-</text>
  <text class="facts-k" x="926" y="134">parent</text>
  <text class="facts-v" x="1162" y="134" text-anchor="end">vit</text>
  <text class="facts-k" x="926" y="156">classes</text>
  <text class="facts-v" x="1162" y="156" text-anchor="end">9</text>
  <text class="facts-k" x="926" y="178">overridden</text>
  <text class="facts-v" x="1162" y="178" text-anchor="end">7</text>
  <text class="facts-k" x="926" y="200">added</text>
  <text class="facts-v" x="1162" y="200" text-anchor="end">12</text>
  <text class="facts-k" x="926" y="222">new classes</text>
  <text class="facts-v" x="1162" y="222" text-anchor="end">5</text>
  <text class="facts-k" x="926" y="244">inherited as-is</text>
  <text class="facts-v" x="1162" y="244" text-anchor="end">3</text>
</g>
<!-- Legend: one outlined swatch per diff state, each followed by its label. -->
<g>
  <text class="legend-t" x="914" y="282" font-weight="700">legend</text>
  <rect x="916" y="293" width="16" height="12" rx="2" fill="none" stroke="var(--added)" stroke-width="3"/>
  <text class="legend-t" x="940" y="303">new submodule (vs parent)</text>
  <rect x="916" y="311" width="16" height="12" rx="2" fill="none" stroke="var(--over)" stroke-width="3"/>
  <text class="legend-t" x="940" y="321">changed / redefined</text>
  <rect x="916" y="329" width="16" height="12" rx="2" fill="none" stroke="var(--deleted)" stroke-width="3"/>
  <text class="legend-t" x="940" y="339">deleted</text>
  <rect x="916" y="347" width="16" height="12" rx="2" fill="none" stroke="var(--grid)" stroke-width="3"/>
  <text class="legend-t" x="940" y="357">inherited / copy-pasted</text>
</g>
<!-- Per-class change list: a coloured dot, the class name, then a one-line summary; entries are 28px apart. -->
<g>
  <text class="legend-t" x="914" y="380" font-weight="700">changes by class</text>
  <circle cx="920" cy="393" r="4" fill="var(--over)"/>
  <text class="facts-v" x="932" y="396">ASTPreTrainedModel</text>
  <text class="box-sub" x="932" y="409">ovr _init_weights; 6 attr</text>
  <circle cx="920" cy="421" r="4" fill="var(--added)"/>
  <text class="facts-v" x="932" y="424">ASTEmbeddings</text>
  <text class="box-sub" x="932" y="437">add __init__,forward,get_shape</text>
  <circle cx="920" cy="449" r="4" fill="var(--added)"/>
  <text class="facts-v" x="932" y="452">ASTForAudioClassification</text>
  <text class="box-sub" x="932" y="465">add __init__,forward</text>
  <circle cx="920" cy="477" r="4" fill="var(--added)"/>
  <text class="facts-v" x="932" y="480">ASTMLPHead</text>
  <text class="box-sub" x="932" y="493">add __init__,forward</text>
  <circle cx="920" cy="505" r="4" fill="var(--added)"/>
  <text class="facts-v" x="932" y="508">ASTModel</text>
  <text class="box-sub" x="932" y="521">add __init__,forward,get_input_embeddings</text>
  <circle cx="920" cy="533" r="4" fill="var(--added)"/>
  <text class="facts-v" x="932" y="536">ASTPatchEmbeddings</text>
  <text class="box-sub" x="932" y="549">add __init__,forward</text>
</g>
</svg>