<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!-- AdaptiveRiverLM architecture diagram on a 1200 x 1400 canvas.
     The inkscape: and sodipodi: namespaces are declared for the editor
     attributes on this element and for the sodipodi:namedview child. -->
<svg
   width="1200"
   height="1400"
   font-family="Arial, sans-serif"
   version="1.1"
   id="svg71"
   sodipodi:docname="svgviewer-output.svg"
   inkscape:version="1.4.2 (2aeb623e1d, 2025-05-12)"
   xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
   xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
   xmlns="http://www.w3.org/2000/svg"
   xmlns:svg="http://www.w3.org/2000/svg">
<!-- Inkscape editor state: page and desk colours, zoom, view centre and window geometry. -->
<sodipodi:namedview
   id="namedview71"
   pagecolor="#505050"
   bordercolor="#eeeeee"
   borderopacity="1"
   inkscape:showpageshadow="0"
   inkscape:pageopacity="0"
   inkscape:pagecheckerboard="0"
   inkscape:deskcolor="#d1d1d1"
   showgrid="false"
   inkscape:zoom="1.1778379"
   inkscape:cx="615.11013"
   inkscape:cy="400.73427"
   inkscape:window-width="2494"
   inkscape:window-height="1371"
   inkscape:window-x="66"
   inkscape:window-y="32"
   inkscape:window-maximized="1"
   inkscape:current-layer="svg71" />
<defs id="defs9">
  <!-- Arrowhead referenced as marker-end by the .arrow-line and .arrow-line-thin classes. -->
  <marker id="arrow" markerWidth="10" markerHeight="7" refX="9" refY="3.5" orient="auto">
    <polygon id="polygon1" points="0 0, 10 3.5, 0 7" fill="#333" />
  </marker>

  <!-- Zone 1 background (rect11): top-to-bottom gradient in bounding-box units. -->
  <linearGradient id="mambaGrad" x1="0%" y1="0%" x2="0%" y2="100%">
    <stop id="stop1" offset="0%" style="stop-color:#E3F2FD;stop-opacity:1" />
    <stop id="stop2" offset="100%" style="stop-color:#BBDEFB;stop-opacity:1" />
  </linearGradient>

  <!-- Zone 3 background (rect56): user-space gradient; after gradientTransform the
       vector runs vertically from about y=937.36 to y=1217.36. -->
  <linearGradient
     id="mambaEnhGrad"
     x1="25.226249"
     y1="1783.8562"
     x2="25.226249"
     y2="2338.8337"
     gradientTransform="matrix(1.9820624,0,0,0.50452498,0,37.356585)"
     gradientUnits="userSpaceOnUse">
    <stop id="stop3" offset="0%" style="stop-color:#81D4FA;stop-opacity:1" />
    <stop id="stop4" offset="100%" style="stop-color:#4FC3F7;stop-opacity:1" />
  </linearGradient>

  <!-- Zone 2 background (rect28): user-space gradient; after gradientTransform the
       vector runs vertically from about y=447.45 to y=927.45. -->
  <linearGradient
     id="moeGrad"
     x1="33.028913"
     y1="681.22133"
     x2="33.028913"
     y2="1407.8574"
     gradientTransform="matrix(1.5138252,0,0,0.66057826,0,-2.5470399)"
     gradientUnits="userSpaceOnUse">
    <stop id="stop5" offset="0%" style="stop-color:#E8F5E9;stop-opacity:1" />
    <stop id="stop6" offset="100%" style="stop-color:#C8E6C9;stop-opacity:1" />
  </linearGradient>

  <!-- 20 x 20 tile of four translucent green dots. NOTE(review): no element in this
       file references #expertPattern; confirm whether it can be removed. -->
  <pattern id="expertPattern" x="0" y="0" width="20" height="20" patternUnits="userSpaceOnUse">
    <circle id="circle6" cx="5" cy="5" r="2" fill="#4CAF50" opacity="0.3" />
    <circle id="circle7" cx="15" cy="5" r="2" fill="#4CAF50" opacity="0.3" />
    <circle id="circle8" cx="5" cy="15" r="2" fill="#4CAF50" opacity="0.3" />
    <circle id="circle9" cx="15" cy="15" r="2" fill="#4CAF50" opacity="0.3" />
  </pattern>
</defs>
<style id="style9">
  /* Class rules outrank presentation attributes on the same element, so
     per-element overrides of these properties must use an inline style. */

  /* Headings and labels */
  .title-main     { font-size: 24px; font-weight: bold; fill: #1a1a1a; }
  .subtitle       { font-size: 14px; fill: #666; }
  .zone-label     { font-size: 18px; font-weight: bold; fill: #1a1a1a; }
  .layer-title    { font-size: 15px; font-weight: bold; fill: #1a1a1a; }
  .layer-subtitle { font-size: 11px; fill: #555; }
  .component-text { font-size: 12px; fill: #333; font-weight: 500; }
  .param-text     { font-size: 10px; fill: #666; font-style: italic; }
  .legend-text    { font-size: 11px; fill: #333; }

  /* Connectors; both end in the #arrow marker */
  .arrow-line      { stroke: #333; stroke-width: 2; fill: none; marker-end: url(#arrow); }
  .arrow-line-thin { stroke: #666; stroke-width: 1.5; fill: none; marker-end: url(#arrow); }

  /* Boxes */
  .expert-box    { fill: #fff; stroke: #4CAF50; stroke-width: 1.5; rx: 3; }
  .router-box    { fill: #FFF3E0; stroke: #FF9800; stroke-width: 1.5; rx: 3; }
  .component-box { fill: #f5f5f5; stroke: #999; stroke-width: 1.5; rx: 4; }

  /* Zone background tint */
  .zone-bg { opacity: 0.15; }
</style>
<!-- Page background; sized to the 1200 x 1400 canvas declared on the root element. -->
<rect id="rect9" x="0" y="0" width="1200" height="1400" fill="#fafafa" />
<!-- Diagram title and subtitle, centred on x=600. -->
<text id="text9" x="600" y="35" text-anchor="middle" class="title-main">AdaptiveRiverLM Architecture</text>
<text id="text10" x="600" y="55" text-anchor="middle" class="subtitle">Hybrid Mamba-SSM + MoE Transformer (~1B parameters, 24 layers)</text>
<!-- Input stage: pill-shaped box with an arrow down to the top edge of zone 1 (y=160). -->
<rect id="rect10" x="450" y="90" width="300" height="40" rx="20" fill="#E0E0E0" stroke="#757575" stroke-width="2" />
<text id="text11" x="600" y="115" text-anchor="middle" class="component-text">Input Token IDs → Embedding (d=1024)</text>
<line id="line11" x1="600" y1="130" x2="600" y2="160" class="arrow-line" />
<!-- Zone 1 frame: tinted gradient background with a dashed outline over the same rectangle. -->
<rect id="rect11" x="50" y="160" width="1100" height="280" rx="10" fill="url(#mambaGrad)" class="zone-bg" />
<rect id="rect12" x="50" y="160" width="1100" height="280" rx="10" fill="none" stroke="#1976D2" stroke-width="2" stroke-dasharray="5,5" />
<text id="text12" x="70" y="185" class="zone-label">Zone 1: Early Mamba (Layers 0-1)</text>
<text id="text13" x="70" y="202" class="param-text">Fast sequence processing • O(n) complexity • d_state=16, expand=2</text>
<!-- Layer 0 drawn in detail: five component boxes in a left-to-right row.
     The 11px sizes use inline style because the .component-text class rule
     (12px) overrides a font-size presentation attribute. -->
<g id="mamba-layer-0">
  <rect id="rect13" x="200" y="220" width="800" height="90" rx="6" fill="#fff" stroke="#1976D2" stroke-width="2" />
  <text id="text14" x="600" y="242" text-anchor="middle" class="layer-title">Layer 0: MambaBlock</text>

  <!-- Box 1: LayerNorm, then Mamba SSM -->
  <rect id="rect14" x="220" y="255" width="150" height="45" class="component-box" />
  <text id="text15" x="295" y="272" text-anchor="middle" class="component-text">LayerNorm</text>
  <text id="text16" x="295" y="287" text-anchor="middle" class="param-text">↓</text>
  <text id="text17" x="295" y="297" text-anchor="middle" class="component-text" style="font-size:11px">Mamba SSM</text>
  <line id="line17" x1="370" y1="277" x2="410" y2="277" class="arrow-line-thin" />

  <!-- Box 2: selective SSM parameters -->
  <rect id="rect17" x="410" y="255" width="110" height="45" fill="#E3F2FD" stroke="#1976D2" stroke-width="1.5" rx="4" />
  <text id="text18" x="465" y="272" text-anchor="middle" class="component-text" style="font-size:11px">Selective SSM</text>
  <text id="text19" x="465" y="285" text-anchor="middle" class="param-text">State: 16</text>
  <text id="text20" x="465" y="296" text-anchor="middle" class="param-text">Conv: 4</text>
  <line id="line20" x1="520" y1="277" x2="560" y2="277" class="arrow-line-thin" />

  <!-- Box 3: first residual -->
  <rect id="rect20" x="560" y="255" width="130" height="45" class="component-box" />
  <text id="text21" x="625" y="275" text-anchor="middle" class="component-text">Residual +</text>
  <text id="text22" x="625" y="290" text-anchor="middle" class="param-text">LayerNorm → FFN</text>
  <line id="line22" x1="690" y1="277" x2="730" y2="277" class="arrow-line-thin" />

  <!-- Box 4: feed-forward network -->
  <rect id="rect22" x="730" y="255" width="120" height="45" class="component-box" />
  <text id="text23" x="790" y="272" text-anchor="middle" class="component-text">GeLU FFN</text>
  <text id="text24" x="790" y="287" text-anchor="middle" class="param-text">4096 hidden</text>
  <text id="text25" x="790" y="297" text-anchor="middle" class="param-text">expand=2</text>

  <!-- Box 5: second residual (no connector is drawn from box 4) -->
  <rect id="rect25" x="870" y="255" width="110" height="45" class="component-box" />
  <text id="text26" x="925" y="280" text-anchor="middle" class="component-text">Residual +</text>
</g>
<!-- Layer 1: collapsed box between the arrow from layer 0 and the arrow leaving zone 1. -->
<line id="line26" x1="600" y1="310" x2="600" y2="330" class="arrow-line" />
<rect id="rect26" x="300" y="330" width="600" height="50" rx="6" fill="#fff" stroke="#1976D2" stroke-width="2" />
<text id="text27" x="600" y="355" text-anchor="middle" class="layer-title">Layer 1: MambaBlock</text>
<text id="text28" x="600" y="370" text-anchor="middle" class="layer-subtitle">(same structure as Layer 0)</text>
<line id="line28" x1="600" y1="380" x2="600" y2="410" class="arrow-line" />
<!-- Zone 2 frame: tinted gradient background with a dashed outline over the same rectangle. -->
<rect id="rect28" x="50" y="447.45297" width="1100" height="480" rx="10" fill="url(#moeGrad)" class="zone-bg" />
<rect id="rect29" x="50" y="447.45297" width="1100" height="480" rx="10" fill="none" stroke="#388e3c" stroke-width="2" stroke-dasharray="5, 5" />
<text id="text29" x="70" y="472.45297" class="zone-label">Zone 2: MoE Layers (Layers 2-21)</text>
<text id="text30" x="70" y="489.45297" class="param-text">Conditional computation • Dynamic expert routing • Budget-aware selection</text>
<!-- Detailed MoE block: attention panel on the left, FFN panel on the right.
     The whole group is shifted up by 2.5470399 px.
     Bold weight and 11px size use inline style because the .component-text
     class rule (font-weight 500, 12px) overrides presentation attributes. -->
<g id="moe-layer-2" transform="translate(0,-2.5470399)">
  <rect id="rect30" x="150" y="510" width="900" height="200" rx="6" fill="#ffffff" stroke="#388e3c" stroke-width="2" />
  <text id="text31" x="600" y="532" text-anchor="middle" class="layer-title">Layer 2-21: RoutedBlock (MoE Attention + MoE FFN)</text>

  <!-- Attention panel: router, 2 x 3 grid of expert heads, combination box -->
  <g id="moe-attention">
    <rect id="rect31" x="170" y="545" width="400" height="155" rx="4" fill="#e8f5e9" stroke="#66bb6a" stroke-width="1.5" />
    <text id="text32" x="370" y="562" text-anchor="middle" class="component-text" style="font-weight:bold">MoE Attention (6 Experts)</text>

    <rect id="rect32" x="190" y="575" width="120" height="35" class="router-box" />
    <text id="text33" x="250" y="590" text-anchor="middle" class="component-text" style="font-size:11px">Attn Router</text>
    <text id="text34" x="250" y="602" text-anchor="middle" class="param-text">top_k: 4-6</text>
    <line id="line34" x1="310" y1="592" x2="340" y2="592" class="arrow-line-thin" />

    <g id="expert-heads">
      <rect id="rect34" x="340" y="572" width="60" height="20" class="expert-box" />
      <text id="text35" x="370" y="586" text-anchor="middle" font-size="9px" fill="#333333">Head 0</text>
      <rect id="rect35" x="340" y="594" width="60" height="20" class="expert-box" />
      <text id="text36" x="370" y="608" text-anchor="middle" font-size="9px" fill="#333333">Head 1</text>
      <rect id="rect36" x="410" y="572" width="60" height="20" class="expert-box" />
      <text id="text37" x="440" y="586" text-anchor="middle" font-size="9px" fill="#333333">Head 2</text>
      <rect id="rect37" x="410" y="594" width="60" height="20" class="expert-box" />
      <text id="text38" x="440" y="608" text-anchor="middle" font-size="9px" fill="#333333">Head 3</text>
      <rect id="rect38" x="480" y="572" width="60" height="20" class="expert-box" />
      <text id="text39" x="510" y="586" text-anchor="middle" font-size="9px" fill="#333333">Head 4</text>
      <rect id="rect39" x="480" y="594" width="60" height="20" class="expert-box" />
      <text id="text40" x="510" y="608" text-anchor="middle" font-size="9px" fill="#333333">Head 5</text>
    </g>

    <text id="text41" x="370" y="630" text-anchor="middle" class="param-text">Each: Q/K/V proj + RoPE + SDPA</text>
    <text id="text42" x="370" y="642" text-anchor="middle" class="param-text">Head dim: 64 (1024/16)</text>

    <rect id="rect42" x="230" y="655" width="280" height="30" class="component-box" />
    <text id="text43" x="370" y="675" text-anchor="middle" class="component-text" style="font-size:11px">Weighted Combination → Output Proj</text>
  </g>

  <!-- FFN panel: router, 2 x 2 grid of experts, combination box -->
  <g id="moe-ffn">
    <rect id="rect43" x="600" y="545" width="430" height="155" rx="4" fill="#e8f5e9" stroke="#66bb6a" stroke-width="1.5" />
    <text id="text44" x="815" y="562" text-anchor="middle" class="component-text" style="font-weight:bold">MoE FFN (4 Experts)</text>

    <rect id="rect44" x="650" y="575" width="120" height="35" class="router-box" />
    <text id="text45" x="710" y="590" text-anchor="middle" class="component-text" style="font-size:11px">FFN Router</text>
    <text id="text46" x="710" y="602" text-anchor="middle" class="param-text">top_k: 1-2</text>
    <line id="line46" x1="770" y1="592" x2="800" y2="592" class="arrow-line-thin" />

    <g id="ffn-experts">
      <rect id="rect46" x="800" y="572" width="100" height="20" class="expert-box" />
      <text id="text47" x="850" y="586" text-anchor="middle" font-size="9px" fill="#333333">Expert 0 (FFN)</text>
      <rect id="rect47" x="800" y="594" width="100" height="20" class="expert-box" />
      <text id="text48" x="850" y="608" text-anchor="middle" font-size="9px" fill="#333333">Expert 1 (FFN)</text>
      <rect id="rect48" x="910" y="572" width="100" height="20" class="expert-box" />
      <text id="text49" x="960" y="586" text-anchor="middle" font-size="9px" fill="#333333">Expert 2 (FFN)</text>
      <rect id="rect49" x="910" y="594" width="100" height="20" class="expert-box" />
      <text id="text50" x="960" y="608" text-anchor="middle" font-size="9px" fill="#333333">Expert 3 (FFN)</text>
    </g>

    <text id="text51" x="815" y="630" text-anchor="middle" class="param-text">Each: Linear(1024→4096) + GeLU + Linear(4096→1024)</text>
    <text id="text52" x="815" y="642" text-anchor="middle" class="param-text">Token-level routing with STE gating</text>

    <rect id="rect52" x="680" y="655" width="270" height="30" class="component-box" />
    <text id="text53" x="815" y="675" text-anchor="middle" class="component-text" style="font-size:11px">Gated Combination + Load Balance</text>
  </g>
</g>
<!-- Collapsed placeholder for the MoE layers that are not drawn individually.
     Zone 2 is labelled layers 2-21 and zone 3 starts at layer 22, so the layers
     after the detailed layer-2 block are 3-21 (19 layers).
     The ellipsis size uses inline style because the .layer-title class rule
     (15px) overrides a font-size presentation attribute. -->
<line id="line53" x1="600" y1="707.45294" x2="600" y2="727.45294" class="arrow-line" />
<rect id="rect53" x="400" y="727.45294" width="400" height="80" rx="6" fill="#ffffff" stroke="#388e3c" stroke-width="2" stroke-dasharray="3, 3" />
<text id="text54" x="600" y="757.45294" text-anchor="middle" class="layer-title" style="font-size:20px">⋮</text>
<text id="text55" x="600" y="782.45294" text-anchor="middle" class="layer-subtitle">Layers 3-21 (19 more MoE layers)</text>
<text id="text56" x="600" y="797.45294" text-anchor="middle" class="param-text">Same structure: MoE Attention (6 experts) + MoE FFN (4 experts)</text>
<line id="line56" x1="600" y1="807.45294" x2="600" y2="857.45294" class="arrow-line" />
<!-- Zone 3 frame: tinted gradient background with a dashed outline.
     Both rectangles share y=937.35657 so the outline coincides with the
     background, as in zones 1 and 2. -->
<rect id="rect56" x="50" y="937.35657" width="1100" height="280" rx="10" fill="url(#mambaEnhGrad)" class="zone-bg" />
<rect id="rect57" x="50" y="937.35657" width="1100" height="280" rx="10" fill="none" stroke="#0288d1" stroke-width="2" stroke-dasharray="5, 5" />
<text id="text57" x="70" y="962.35657" class="zone-label">Zone 3: Enhanced Mamba (Layers 22-23)</text>
<text id="text58" x="70" y="979.35657" class="param-text">High-capacity refinement • d_state=16, expand=4 (2× capacity)</text>
<!-- Layer 22: outer box with one inner summary box.
     The 11px size uses inline style because the .component-text class rule
     (12px) overrides a font-size presentation attribute. -->
<rect id="rect58" x="200" y="997.35657" width="800" height="70" rx="6" fill="#ffffff" stroke="#0288d1" stroke-width="2" />
<text id="text59" x="600" y="1019.3566" text-anchor="middle" class="layer-title">Layer 22: Enhanced MambaBlock</text>
<rect id="rect59" x="300" y="1032.3566" width="600" height="25" class="component-box" />
<text id="text60" x="600" y="1049.3566" text-anchor="middle" class="component-text" style="font-size:11px">Mamba SSM + FFN (expand=4, 8192 hidden dim)</text>
<line id="line60" x1="600" y1="1067.3566" x2="600" y2="1087.3566" class="arrow-line" />

<!-- Layer 23: collapsed box followed by the arrow leaving zone 3. -->
<rect id="rect60" x="300" y="1087.3566" width="600" height="50" rx="6" fill="#ffffff" stroke="#0288d1" stroke-width="2" />
<text id="text61" x="600" y="1109.3566" text-anchor="middle" class="layer-title">Layer 23: Enhanced MambaBlock</text>
<text id="text62" x="600" y="1127.3566" text-anchor="middle" class="layer-subtitle">(same structure as Layer 22)</text>
<line id="line62" x1="600" y1="1137.3566" x2="600" y2="1157.3566" class="arrow-line" />
<!-- Output stage box below zone 3. -->
<rect id="rect62" x="400" y="1220.4116" width="400" height="40" rx="6" fill="#e0e0e0" stroke="#757575" stroke-width="2" />
<text id="text63" x="600" y="1245.4116" text-anchor="middle" class="component-text">Final LayerNorm → LM Head → Logits</text>
<!-- Legend panel: colour key in the left column, budget-ratio notes in the right column.
     The 14px heading size uses inline style because the .zone-label class rule
     (18px) overrides a font-size presentation attribute. -->
<g id="legend">
  <rect id="rect63" x="50" y="1270" width="1100" height="110" rx="8" fill="#fff" stroke="#999" stroke-width="1.5" />
  <text id="text64" x="70" y="1290" class="zone-label" style="font-size:14px">Legend &amp; Key Features</text>

  <!-- Left column: colour key -->
  <circle id="circle64" cx="80" cy="1310" r="6" fill="#1976D2" />
  <text id="text65" x="95" y="1315" class="legend-text">Mamba: State Space Model (O(n) complexity, efficient long context)</text>
  <circle id="circle65" cx="80" cy="1330" r="6" fill="#388E3C" />
  <text id="text66" x="95" y="1335" class="legend-text">MoE: Mixture of Experts (conditional computation, sparse activation)</text>
  <rect id="rect66" x="75" y="1343" width="15" height="12" class="router-box" />
  <text id="text67" x="95" y="1352" class="legend-text">Router: Dynamic expert selection based on input content</text>

  <!-- Right column: budget ratio notes (.legend-text sets no font-weight, so the attribute applies) -->
  <text id="text68" x="550" y="1315" class="legend-text" font-weight="bold">Budget Ratio: Runtime control parameter (0.0-1.0)</text>
  <text id="text69" x="565" y="1330" class="legend-text">• Controls active expert count (speed vs. quality tradeoff)</text>
  <text id="text70" x="565" y="1345" class="legend-text">• Example: 0.5 = 50% fewer experts active → 2× faster inference</text>
  <text id="text71" x="565" y="1360" class="legend-text">• Auxiliary losses: Load balancing, Router-Z, Entropy regularization</text>
</g>
</svg>