| <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 1200 1076" width="1200" height="1076" font-size="14"> |
| <style> |
| /* Light palette. For each node family, --name is the fill and --name-s is the stroke. */ |
| :root { |
| --bg: #ffffff; --fg: #1b1f24; --muted: #6b7280; --panel: #f6f8fa; --grid: #e5e7eb; |
| --embed: #dbeafe; --embed-s: #3b82f6; |
| --attn: #cffafe; --attn-s: #06b6d4; |
| --mamba: #dcfce7; --mamba-s: #22c55e; |
| --linattn: #fce7f3; --linattn-s: #ec4899; |
| --recur: #ede9fe; --recur-s: #8b5cf6; |
| --moe: #ffedd5; --moe-s: #f97316; |
| --mlp: #ede9fe; --mlp-s: #8b5cf6; |
| --norm: #e5e7eb; --norm-s: #9ca3af; |
| --head: #fee2e2; --head-s: #ef4444; |
| --config: #f1f5f9; --config-s: #64748b; |
| --rope: #fef9c3; --rope-s: #eab308; |
| --layer: #f8fafc; --layer-s: #cbd5e1; |
| --io: #f1f5f9; --io-s: #94a3b8; |
| --soft: #fae8ff; --soft-s: #c026d3; |
| --add: #ffffff; --add-s: #475569; |
| --block-s: #94a3b8; |
| --residual: #f59e0b; |
| --added: #16a34a; --over: #d97706; --deleted: #dc2626; |
| --lt-full: #06b6d4; --lt-sliding: #3b82f6; --lt-chunked: #8b5cf6; |
| --lt-compressed: #f97316; --lt-heavy: #dc2626; --lt-linear: #ec4899; --lt-mamba: #22c55e; |
| --cell-on: #0ea5e9; --cell-off: #e5e7eb; |
| --vision: #dcfce7; --vision-s: #16a34a; --audio: #fae8ff; --audio-s: #c026d3; |
| --proj: #fef3c7; --proj-s: #d97706; --xattn: #db2777; |
| --conv: #d1fae5; --conv-s: #10b981; --act: #ecfccb; --act-s: #65a30d; |
| --pool: #e0f2fe; --pool-s: #0284c7; --quant: #fae8ff; --quant-s: #c026d3; |
| } |
| /* Dark palette: overrides page colours and node fills only; strokes keep their light-theme values. */ |
| @media (prefers-color-scheme: dark) { |
| :root { |
| --bg: #0d1117; --fg: #e6edf3; --muted: #8b949e; --panel: #161b22; --grid: #30363d; |
| --embed: #172554; --attn: #083344; --mamba: #052e16; --linattn: #500724; |
| --recur: #2e1065; --moe: #431407; --mlp: #2e1065; --norm: #21262d; --head: #450a0a; |
| --config: #1e293b; --rope: #422006; --layer: #161b22; |
| --conv: #022c22; --act: #1a2e05; --pool: #082f49; --quant: #3b0764; --proj: #422006; |
| --io: #1e293b; --soft: #3b0764; --add: #0d1117; --cell-off: #21262d; |
| /* Vision/audio share their light fills with --mamba and --soft, so they reuse those families' dark fills too. */ |
| --vision: #052e16; --audio: #3b0764; |
| } |
| } |
| /* Page background and typography. */ |
| .bg { fill: var(--bg); } |
| text { font-family: ui-sans-serif, -apple-system, "Segoe UI", Roboto, sans-serif; fill: var(--fg); } |
| .title { font-size: 22px; font-weight: 700; } |
| .subtitle { font-size: 13px; fill: var(--muted); } |
| .box-label { font-size: 14px; font-weight: 600; } |
| .box-label.sm { font-size: 12.5px; } |
| .box-sub { font-size: 11px; fill: var(--muted); } |
| .glyph { font-size: 18px; font-weight: 700; fill: var(--add-s); } |
| .badge { font-size: 12px; font-weight: 700; fill: var(--fg); } |
| .facts-k { font-size: 11.5px; fill: var(--muted); } |
| .facts-v { font-size: 11.5px; font-weight: 600; } |
| .legend-t { font-size: 11.5px; fill: var(--fg); } |
| .panel { fill: var(--panel); stroke: var(--grid); } |
| /* Node boxes. No CSS rx here: an author rule would override the per-element rx attribute (9, 11, 15, 16) that every rect.b carries. */ |
| rect.b { stroke-width: 1.6; } |
| /* Node families: fill + stroke pairs. */ |
| .c-embed { fill: var(--embed); stroke: var(--embed-s); } |
| .c-attn { fill: var(--attn); stroke: var(--attn-s); } |
| .c-mamba { fill: var(--mamba); stroke: var(--mamba-s); } |
| .c-linattn { fill: var(--linattn); stroke: var(--linattn-s); } |
| .c-recur { fill: var(--recur); stroke: var(--recur-s); } |
| .c-moe { fill: var(--moe); stroke: var(--moe-s); } |
| .c-mlp { fill: var(--mlp); stroke: var(--mlp-s); } |
| .c-norm { fill: var(--norm); stroke: var(--norm-s); } |
| .c-head { fill: var(--head); stroke: var(--head-s); } |
| .c-config { fill: var(--config); stroke: var(--config-s); } |
| .c-rope { fill: var(--rope); stroke: var(--rope-s); } |
| .c-proj { fill: var(--proj); stroke: var(--proj-s); } |
| .c-conv { fill: var(--conv); stroke: var(--conv-s); } |
| .c-act { fill: var(--act); stroke: var(--act-s); } |
| .c-pool { fill: var(--pool); stroke: var(--pool-s); } |
| .c-quant { fill: var(--quant); stroke: var(--quant-s); } |
| .c-layer { fill: var(--layer); stroke: var(--layer-s); } |
| .c-io { fill: var(--io); stroke: var(--io-s); } |
| .c-soft { fill: var(--soft); stroke: var(--soft-s); } |
| .c-add { fill: var(--add); stroke: var(--add-s); } |
| /* Dashed, unfilled container frames. */ |
| .c-block { fill: none; stroke: var(--block-s); stroke-width: 1.6; stroke-dasharray: 7 5; } |
| /* Layer-type swatches: fill and stroke use the same colour. */ |
| .c-lt-full { fill: var(--lt-full); stroke: var(--lt-full); } |
| .c-lt-sliding { fill: var(--lt-sliding); stroke: var(--lt-sliding); } |
| .c-lt-chunked { fill: var(--lt-chunked); stroke: var(--lt-chunked); } |
| .c-lt-compressed { fill: var(--lt-compressed); stroke: var(--lt-compressed); } |
| .c-lt-heavy { fill: var(--lt-heavy); stroke: var(--lt-heavy); } |
| .c-lt-linear { fill: var(--lt-linear); stroke: var(--lt-linear); } |
| .c-lt-mamba { fill: var(--lt-mamba); stroke: var(--lt-mamba); } |
| /* Grid and attention-mask cells. */ |
| .cell-on { fill: var(--cell-on); } |
| .cell-off { fill: var(--cell-off); } |
| .grid-frame { fill: none; stroke: var(--grid); stroke-width: 1; } |
| .mask-bg { fill: var(--cell-off); } |
| .mask-on { fill: #22c55e; } |
| .mask-div { stroke: var(--fg); stroke-width: 1.5; stroke-dasharray: 3 2; } |
| .c-vision { fill: var(--vision); stroke: var(--vision-s); } |
| .c-audio { fill: var(--audio); stroke: var(--audio-s); } |
| .c-sub { fill: var(--bg); stroke: var(--block-s); stroke-width: 1.2; } |
| /* Section header strip drawn over a section frame. */ |
| .sec-h { font-size: 12px; font-weight: 700; } |
| .sec-hbar { fill: var(--bg); opacity: 0.82; } |
| .residual.xattn { stroke: var(--xattn); stroke-width: 2.4; } |
| /* Ghosted nodes: faded and dashed. */ |
| .ghost { opacity: 0.32; stroke-dasharray: 4 3; } |
| /* Change markers: a thicker outline in the added / overridden / deleted colour. */ |
| .ch-added rect.b, rect.b.ch-added { stroke: var(--added); stroke-width: 3.2; } |
| .ch-over rect.b, rect.b.ch-over { stroke: var(--over); stroke-width: 3.2; } |
| .ch-deleted rect.b, rect.b.ch-deleted { stroke: var(--deleted); stroke-width: 3.2; } |
| /* Connectors. */ |
| .edge { stroke: var(--grid); stroke-width: 2; } |
| .flow { stroke: var(--grid); stroke-width: 2; fill: none; } |
| .residual { stroke: var(--residual); stroke-width: 2; fill: none; } |
| .rope { stroke: var(--rope-s); stroke-width: 2.2; fill: none; } |
| .xattn { stroke: var(--xattn); stroke-width: 2.4; fill: none; } |
| .cell-idx { font-size: 9px; fill: #ffffff; font-weight: 600; } |
| .sky { fill: #bae6fd; } .sun { fill: #fde047; } .hill { fill: #4ade80; } |
| </style> |
| <!-- Arrowhead markers, one per connector colour (flow, residual, rope, xattn). Each is the same filled triangle; only the fill variable differs. In the lines visible here only #ah-residual is referenced. --><defs><marker id="ah-flow" viewBox="0 0 10 10" refX="8" refY="5" markerWidth="7" markerHeight="7" orient="auto-start-reverse"><path d="M0,0 L10,5 L0,10 z" fill="var(--grid)"/></marker><marker id="ah-residual" viewBox="0 0 10 10" refX="8" refY="5" markerWidth="7" markerHeight="7" orient="auto-start-reverse"><path d="M0,0 L10,5 L0,10 z" fill="var(--residual)"/></marker><marker id="ah-rope" viewBox="0 0 10 10" refX="8" refY="5" markerWidth="7" markerHeight="7" orient="auto-start-reverse"><path d="M0,0 L10,5 L0,10 z" fill="var(--rope-s)"/></marker><marker id="ah-xattn" viewBox="0 0 10 10" refX="8" refY="5" markerWidth="7" markerHeight="7" orient="auto-start-reverse"><path d="M0,0 L10,5 L0,10 z" fill="var(--xattn)"/></marker></defs> |
| <!-- Canvas background, diagram title and diff summary, then the vertical spine (x=485) that the node column is centred on. --> |
| <rect class="bg" x="0" y="0" width="1200" height="1076"/> |
| <text class="title" x="24" y="34">audio_spectrogram_transformer</text> |
| <text class="subtitle" x="24" y="54">diff vs vit · 7 overridden · 12 added · 0 deleted · 5 new · 3 inherited-as-is</text> |
| <line class="edge" x1="485" y1="108" x2="485" y2="1034"/> |
| <!-- Dashed container frames: base model, the repeated encoder layer (with its repeat badge), and the task head. --> |
| <g class=""><rect class="c-block" x="286" y="140" width="398" height="708" rx="14"/><text class="box-sub" x="298" y="158">ASTModel · base model</text></g> |
| <g class=""><title>12 × ASTLayer</title><rect class="c-block" x="294" y="228" width="382" height="580" rx="14"/><text class="box-sub" x="306" y="246">ASTLayer</text><text class="badge" x="664" y="248" text-anchor="end">× 12</text></g> |
| <g class=""><rect class="c-block" x="286" y="848" width="398" height="194" rx="14"/><text class="box-sub" x="298" y="866">ASTForAudioClassification · task head</text></g> |
| <!-- Ghosted section frames for the self-attention and MLP sub-blocks; each has a translucent header strip with its label. --> |
| <g class="ghost"><rect class="b c-attn " x="320" y="292" width="330" height="280" rx="11"/><rect class="sec-hbar" x="325" y="296" width="320" height="18" rx="5"/><text class="sec-h" x="332" y="309">Self-Attention · ViTAttention ↩ inherited</text></g> |
| <g class="ghost"><rect class="b c-mlp " x="320" y="668" width="330" height="92" rx="11"/><rect class="sec-hbar" x="325" y="672" width="320" height="18" rx="5"/><text class="sec-h" x="332" y="685">MLP · ViTMLP ↩ inherited</text></g> |
| <!-- Dashed residual connectors routed left of the node column: the first runs from y=250 to y=599, the second from y=599 to y=787 (the rows of the two residual-add circles). Both end with the #ah-residual arrowhead. --><polyline class="residual" points="320,250 286,250 286,599 320,599" stroke-dasharray="6 4" marker-end="url(#ah-residual)"/> |
| <polyline class="residual" points="320,599 270,599 270,787 320,787" stroke-dasharray="6 4" marker-end="url(#ah-residual)"/> |
| <!-- Input pill, the overridden feature-projection node (orange dot = changed vs parent), and the ghosted pre-attention LayerNorm. --> |
| <g class=""><rect class="b c-io" x="320" y="108" width="330" height="32" rx="16"/><text class="box-label sm" x="485.0" y="122.0" text-anchor="middle">input audio</text><text class="box-sub" x="485.0" y="137.0" text-anchor="middle">input_features [1, n_mels, T]</text></g> |
| <g class=""><rect class="b c-embed ch-over" x="320" y="154" width="330" height="60" rx="9"/><text class="box-label sm" x="485.0" y="174.5" text-anchor="middle">Feature Projection</text><text class="box-sub" x="485.0" y="189.5" text-anchor="middle">conv/linear → [1, 6, 768]</text><text class="box-sub" x="485.0" y="204.5" text-anchor="middle">+ position embeddings</text><circle cx="332" cy="166" r="5" fill="var(--over)"/></g> |
| <g class="ghost"><rect class="b c-norm" x="320" y="250" width="330" height="28" rx="9"/><text class="box-label sm" x="485.0" y="262.0" text-anchor="middle">LayerNorm</text><text class="box-sub" x="485.0" y="277.0" text-anchor="middle">pre-attention [1, 6, 768]</text></g> |
| <!-- Ghosted attention internals: q/k/v/o projections in a 2×2 grid, then the scaled dot-product attention node. --> |
| <g class="ghost"><title>q_proj: Linear [768→768]</title><rect class="b c-proj" x="330" y="318" width="151" height="22" rx="9"/><text class="box-label sm" x="405.5" y="333.5" text-anchor="middle">q_proj [768→768]</text></g> |
| <g class="ghost"><title>k_proj: Linear [768→768]</title><rect class="b c-proj" x="489" y="318" width="151" height="22" rx="9"/><text class="box-label sm" x="564.5" y="333.5" text-anchor="middle">k_proj [768→768]</text></g> |
| <g class="ghost"><title>v_proj: Linear [768→768]</title><rect class="b c-proj" x="330" y="348" width="151" height="22" rx="9"/><text class="box-label sm" x="405.5" y="363.5" text-anchor="middle">v_proj [768→768]</text></g> |
| <g class="ghost"><title>o_proj: Linear [768→768]</title><rect class="b c-proj" x="489" y="348" width="151" height="22" rx="9"/><text class="box-label sm" x="564.5" y="363.5" text-anchor="middle">o_proj [768→768]</text></g> |
| <g class="ghost"><rect class="b c-sub" x="330" y="378" width="310" height="30" rx="9"/><text class="box-label sm" x="485.0" y="391.0" text-anchor="middle">scaled dot-product attention</text><text class="box-sub" x="485.0" y="406.0" text-anchor="middle">12 heads · head_dim 64</text></g> |
| <!-- Bidirectional mask thumbnail. Every cell of the 14×14 grid is on, so a single 140×140 rect replaces the 196 per-cell 10×10 rects with identical rendering. NOTE(review): the arrow glyphs in the title were reconstructed from mis-decoded bytes (assumed q↓ = rows, k→ = columns); confirm against the generator. --> |
| <g class=""><title>bidirectional attention mask (q↓ × k→)</title><text class="box-sub" x="415" y="420">bidirectional mask</text><rect class="mask-bg" x="415" y="426" width="140.0" height="140.0"/><rect class="mask-on" x="415" y="426" width="140.0" height="140.0"/><rect class="grid-frame" x="415" y="426" width="140.0" height="140.0"/></g> |
| <!-- First residual-add node (circle with a + glyph) at y=599, followed by the ghosted pre-FFN LayerNorm. --><g class=""><title>residual add</title><circle class="b c-add " cx="485.0" cy="599" r="13"/><text class="glyph" x="485.0" y="605" text-anchor="middle">+</text></g> |
| <g class="ghost"><rect class="b c-norm" x="320" y="626" width="330" height="28" rx="9"/><text class="box-label sm" x="485.0" y="638.0" text-anchor="middle">LayerNorm</text><text class="box-sub" x="485.0" y="653.0" text-anchor="middle">pre-FFN [1, 6, 768]</text></g> |
| <!-- Ghosted MLP internals: activation and fc1 side by side, fc2 full width below. --> |
| <g class="ghost"><title>activation_fn: GELUActivation</title><rect class="b c-act" x="332" y="694" width="149" height="22" rx="9"/><text class="box-label sm" x="406.5" y="709.5" text-anchor="middle">activation_fn GELU</text></g> |
| <g class="ghost"><title>fc1: Linear [768→3072]</title><rect class="b c-proj" x="489" y="694" width="149" height="22" rx="9"/><text class="box-label sm" x="563.5" y="709.5" text-anchor="middle">fc1 [768→3072]</text></g> |
| <g class="ghost"><title>fc2: Linear [3072→768]</title><rect class="b c-proj" x="332" y="724" width="306" height="22" rx="9"/><text class="box-label sm" x="485.0" y="739.5" text-anchor="middle">fc2 [3072→768]</text></g> |
| <!-- Second residual-add node at y=787, followed by the ghosted Final LayerNorm. --><g class=""><title>residual add</title><circle class="b c-add " cx="485.0" cy="787" r="13"/><text class="glyph" x="485.0" y="793" text-anchor="middle">+</text></g> |
| <g class="ghost"><rect class="b c-norm" x="320" y="814" width="330" height="28" rx="9"/><text class="box-label sm" x="485.0" y="826.0" text-anchor="middle">Final LayerNorm</text><text class="box-sub" x="485.0" y="841.0" text-anchor="middle">[1, 6, 768]</text></g> |
| <!-- Task-head nodes, both marked as overridden (orange outline and dot): pooling, then the linear classifier. --> |
| <g class=""><title>ASTForAudioClassification</title><rect class="b c-head ch-over" x="320" y="856" width="330" height="40" rx="9"/><text class="box-label sm" x="485.0" y="874.0" text-anchor="middle">Pool (CLS / mean)</text><text class="box-sub" x="485.0" y="889.0" text-anchor="middle">[1, 6, 768] → [1, 768]</text><circle cx="332" cy="868" r="5" fill="var(--over)"/></g> |
| <g class=""><title>ASTForAudioClassification</title><rect class="b c-head ch-over" x="320" y="910" width="330" height="40" rx="9"/><text class="box-label sm" x="485.0" y="928.0" text-anchor="middle">Classifier head</text><text class="box-sub" x="485.0" y="943.0" text-anchor="middle">Linear [768→2]</text><circle cx="332" cy="922" r="5" fill="var(--over)"/></g> |
| <!-- Ghosted Softmax node, then the output pill labelled "class logits" with its shape. --><g class="ghost"><rect class="b c-soft" x="320" y="964" width="330" height="26" rx="9"/><text class="box-label sm" x="485.0" y="981.5" text-anchor="middle">Softmax</text></g> |
| <g class=""><rect class="b c-io" x="320" y="1004" width="330" height="30" rx="15"/><text class="box-label sm" x="485.0" y="1017.0" text-anchor="middle">class logits</text><text class="box-sub" x="485.0" y="1032.0" text-anchor="middle">[1, 2] (2 classes)</text></g> |
| <!-- Facts panel: one key/value row every 22px. Keys are left-aligned at x=926; values are right-aligned at x=1162 via text-anchor="end". NOTE(review): the model id value ends in a hyphen and looks truncated; confirm against the generator. --><rect class="panel" x="912" y="88" width="264" height="174" rx="8"/> |
| <text class="facts-k" x="926" y="112">model id</text> |
| <text class="facts-v" x="1162" y="112" text-anchor="end">MIT/ast-finetuned-audioset-10-</text> |
| <text class="facts-k" x="926" y="134">parent</text> |
| <text class="facts-v" x="1162" y="134" text-anchor="end">vit</text> |
| <text class="facts-k" x="926" y="156">classes</text> |
| <text class="facts-v" x="1162" y="156" text-anchor="end">9</text> |
| <text class="facts-k" x="926" y="178">overridden</text> |
| <text class="facts-v" x="1162" y="178" text-anchor="end">7</text> |
| <text class="facts-k" x="926" y="200">added</text> |
| <text class="facts-v" x="1162" y="200" text-anchor="end">12</text> |
| <text class="facts-k" x="926" y="222">new classes</text> |
| <text class="facts-v" x="1162" y="222" text-anchor="end">5</text> |
| <text class="facts-k" x="926" y="244">inherited as-is</text> |
| <text class="facts-v" x="1162" y="244" text-anchor="end">3</text> |
| <!-- Legend: each row is an unfilled swatch whose outline colour names a change kind (added, overridden, deleted, inherited), followed by its label. --><text class="legend-t" x="914" y="282" font-weight="700">legend</text> |
| <rect x="916" y="293" width="16" height="12" rx="2" fill="none" stroke="var(--added)" stroke-width="3"/> |
| <text class="legend-t" x="940" y="303">new submodule (vs parent)</text> |
| <rect x="916" y="311" width="16" height="12" rx="2" fill="none" stroke="var(--over)" stroke-width="3"/> |
| <text class="legend-t" x="940" y="321">changed / redefined</text> |
| <rect x="916" y="329" width="16" height="12" rx="2" fill="none" stroke="var(--deleted)" stroke-width="3"/> |
| <text class="legend-t" x="940" y="339">deleted</text> |
| <rect x="916" y="347" width="16" height="12" rx="2" fill="none" stroke="var(--grid)" stroke-width="3"/> |
| <text class="legend-t" x="940" y="357">inherited / copy-pasted</text> |
| <!-- Per-class change list: each entry is a dot filled with the overridden or added colour, the class name, and a one-line summary of the changed members. Entries are spaced 28px apart. --><text class="legend-t" x="914" y="380" font-weight="700">changes by class</text> |
| <circle cx="920" cy="393" r="4" fill="var(--over)"/> |
| <text class="facts-v" x="932" y="396">ASTPreTrainedModel</text> |
| <text class="box-sub" x="932" y="409">ovr _init_weights; 6 attr</text> |
| <circle cx="920" cy="421" r="4" fill="var(--added)"/> |
| <text class="facts-v" x="932" y="424">ASTEmbeddings</text> |
| <text class="box-sub" x="932" y="437">add __init__,forward,get_shape</text> |
| <circle cx="920" cy="449" r="4" fill="var(--added)"/> |
| <text class="facts-v" x="932" y="452">ASTForAudioClassification</text> |
| <text class="box-sub" x="932" y="465">add __init__,forward</text> |
| <circle cx="920" cy="477" r="4" fill="var(--added)"/> |
| <text class="facts-v" x="932" y="480">ASTMLPHead</text> |
| <text class="box-sub" x="932" y="493">add __init__,forward</text> |
| <circle cx="920" cy="505" r="4" fill="var(--added)"/> |
| <text class="facts-v" x="932" y="508">ASTModel</text> |
| <text class="box-sub" x="932" y="521">add __init__,forward,get_input_embeddings</text> |
| <circle cx="920" cy="533" r="4" fill="var(--added)"/> |
| <text class="facts-v" x="932" y="536">ASTPatchEmbeddings</text> |
| <text class="box-sub" x="932" y="549">add __init__,forward</text> |
| </svg> |