ROE_EDU_BASE_Undercooked / ROE_Build.svg
Alienanthony's picture
Upload of model inferencing and svg
9cd89a6 verified
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<svg
    xmlns="http://www.w3.org/2000/svg"
    xmlns:svg="http://www.w3.org/2000/svg"
    xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
    xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
    version="1.1"
    id="svg71"
    width="1200"
    height="1400"
    viewBox="0 0 1200 1400"
    font-family="Arial, sans-serif"
    sodipodi:docname="svgviewer-output.svg"
    inkscape:version="1.4.2 (2aeb623e1d, 2025-05-12)">
<!-- The viewBox repeats the 1200 x 1400 width/height so the drawing scales as one unit
     (instead of being cropped) when a host page overrides the rendered size. -->
<!-- Editor metadata in the sodipodi/inkscape namespaces (not an SVG-namespace element). -->
<sodipodi:namedview id="namedview71"
    pagecolor="#505050" bordercolor="#eeeeee" borderopacity="1"
    showgrid="false"
    inkscape:showpageshadow="0" inkscape:pageopacity="0" inkscape:pagecheckerboard="0"
    inkscape:deskcolor="#d1d1d1"
    inkscape:zoom="1.1778379" inkscape:cx="615.11013" inkscape:cy="400.73427"
    inkscape:window-width="2494" inkscape:window-height="1371"
    inkscape:window-x="66" inkscape:window-y="32" inkscape:window-maximized="1"
    inkscape:current-layer="svg71"/>
<defs id="defs9">
  <!-- Arrowhead: referenced as url(#arrow) by the marker-end declarations in the stylesheet. -->
  <marker id="arrow" orient="auto" markerWidth="10" markerHeight="7" refX="9" refY="3.5">
    <polygon id="polygon1" fill="#333" points="0 0, 10 3.5, 0 7"/>
  </marker>

  <!-- Gradient definitions -->
  <!-- mambaGrad is expressed in percentages; the other two use userSpaceOnUse
       coordinates combined with a gradientTransform. -->
  <linearGradient id="mambaGrad" x1="0%" y1="0%" x2="0%" y2="100%">
    <stop id="stop1" offset="0%" style="stop-color:#E3F2FD;stop-opacity:1"/>
    <stop id="stop2" offset="100%" style="stop-color:#BBDEFB;stop-opacity:1"/>
  </linearGradient>
  <linearGradient id="mambaEnhGrad" gradientUnits="userSpaceOnUse"
                  gradientTransform="matrix(1.9820624,0,0,0.50452498,0,37.356585)"
                  x1="25.226249" y1="1783.8562" x2="25.226249" y2="2338.8337">
    <stop id="stop3" offset="0%" style="stop-color:#81D4FA;stop-opacity:1"/>
    <stop id="stop4" offset="100%" style="stop-color:#4FC3F7;stop-opacity:1"/>
  </linearGradient>
  <linearGradient id="moeGrad" gradientUnits="userSpaceOnUse"
                  gradientTransform="matrix(1.5138252,0,0,0.66057826,0,-2.5470399)"
                  x1="33.028913" y1="681.22133" x2="33.028913" y2="1407.8574">
    <stop id="stop5" offset="0%" style="stop-color:#E8F5E9;stop-opacity:1"/>
    <stop id="stop6" offset="100%" style="stop-color:#C8E6C9;stop-opacity:1"/>
  </linearGradient>

  <!-- Pattern for experts: a 20 x 20 tile holding four translucent green dots.
       No url(#expertPattern) reference appears in the visible file. -->
  <pattern id="expertPattern" patternUnits="userSpaceOnUse" x="0" y="0" width="20" height="20">
    <circle id="circle6" cx="5" cy="5" r="2" fill="#4CAF50" opacity="0.3"/>
    <circle id="circle7" cx="15" cy="5" r="2" fill="#4CAF50" opacity="0.3"/>
    <circle id="circle8" cx="5" cy="15" r="2" fill="#4CAF50" opacity="0.3"/>
    <circle id="circle9" cx="15" cy="15" r="2" fill="#4CAF50" opacity="0.3"/>
  </pattern>
</defs>
<style id="style9">
  /* Headings and body text */
  .title-main     { font-size: 24px; font-weight: bold; fill: #1a1a1a; }
  .subtitle       { font-size: 14px; fill: #666; }
  .zone-label     { font-size: 18px; font-weight: bold; fill: #1a1a1a; }
  .layer-title    { font-size: 15px; font-weight: bold; fill: #1a1a1a; }
  .layer-subtitle { font-size: 11px; fill: #555; }
  .component-text { font-size: 12px; fill: #333; font-weight: 500; }
  .param-text     { font-size: 10px; fill: #666; font-style: italic; }
  .legend-text    { font-size: 11px; fill: #333; }

  /* Connectors: both end in the #arrow marker */
  .arrow-line      { stroke: #333; stroke-width: 2; fill: none; marker-end: url(#arrow); }
  .arrow-line-thin { stroke: #666; stroke-width: 1.5; fill: none; marker-end: url(#arrow); }

  /* Boxes */
  .component-box { fill: #f5f5f5; stroke: #999; stroke-width: 1.5; rx: 4; }
  .expert-box    { fill: #fff; stroke: #4CAF50; stroke-width: 1.5; rx: 3; }
  .router-box    { fill: #FFF3E0; stroke: #FF9800; stroke-width: 1.5; rx: 3; }

  /* Translucent zone backdrop */
  .zone-bg { opacity: 0.15; }
</style>
<!-- Background -->
<!-- Page-wide fill starting at the origin; its height (1421.6123) is larger than
     the root element's declared height of 1400. -->
<rect id="rect9" x="0" y="0" width="1200" height="1421.6123"
      fill="#fafafa" style="stroke-width:1.00769"/>
<!-- Title -->
<!-- Heading and sub-heading, both centred on x=600. -->
<text id="text9" class="title-main" x="600" y="35" text-anchor="middle">AdaptiveRiverLM Architecture</text>
<text id="text10" class="subtitle" x="600" y="55" text-anchor="middle">Hybrid Mamba-SSM + MoE Transformer (~1B parameters, 24 layers)</text>
<!-- Input -->
<!-- Rounded input box, its label, and the arrow that ends at y=160. -->
<rect id="rect10" x="450" y="90" width="300" height="40" rx="20"
      fill="#E0E0E0" stroke="#757575" stroke-width="2"/>
<text id="text11" class="component-text" x="600" y="115" text-anchor="middle">Input Token IDs → Embedding (d=1024)</text>
<line id="line11" class="arrow-line" x1="600" y1="130" x2="600" y2="160"/>
<!-- ========== ZONE 1: Early Mamba ========== -->
<!-- Two rects with identical geometry: a translucent gradient backdrop (.zone-bg)
     and a dashed outline, followed by the zone heading and its parameter line. -->
<rect id="rect11" class="zone-bg" x="50" y="160" width="1100" height="280" rx="10"
      fill="url(#mambaGrad)"/>
<rect id="rect12" x="50" y="160" width="1100" height="280" rx="10"
      fill="none" stroke="#1976D2" stroke-width="2" stroke-dasharray="5,5"/>
<text id="text12" class="zone-label" x="70" y="185">Zone 1: Early Mamba (Layers 0-1)</text>
<text id="text13" class="param-text" x="70" y="202">Fast sequence processing • O(n) complexity • d_state=16, expand=2</text>
<!-- Layer 0 Detailed -->
<!--
  Row of five boxes (y=255, height 45) joined left to right by thin arrows at y=277.
  Per-element font-size overrides are written as inline style: the .component-text
  class rule outranks a font-size presentation attribute, so font-size="11" as a
  plain attribute never took effect.
-->
<g id="mamba-layer-0">
  <rect id="rect13" x="200" y="220" width="800" height="90" rx="6"
        fill="#fff" stroke="#1976D2" stroke-width="2"/>
  <text id="text14" class="layer-title" x="600" y="242" text-anchor="middle">Layer 0: MambaBlock</text>

  <!-- Internal components -->
  <!-- Box 1: LayerNorm / Mamba SSM -->
  <rect id="rect14" class="component-box" x="220" y="255" width="150" height="45"/>
  <text id="text15" class="component-text" x="295" y="272" text-anchor="middle">LayerNorm</text>
  <!-- text16 has no content in the source; kept unchanged as a placeholder. -->
  <text id="text16" class="param-text" x="295" y="287" text-anchor="middle"></text>
  <text id="text17" class="component-text" x="295" y="297" text-anchor="middle" style="font-size:11px">Mamba SSM</text>
  <line id="line17" class="arrow-line-thin" x1="370" y1="277" x2="410" y2="277"/>

  <!-- Box 2: Selective SSM -->
  <rect id="rect17" x="410" y="255" width="110" height="45" rx="4"
        fill="#E3F2FD" stroke="#1976D2" stroke-width="1.5"/>
  <text id="text18" class="component-text" x="465" y="272" text-anchor="middle" style="font-size:11px">Selective SSM</text>
  <text id="text19" class="param-text" x="465" y="285" text-anchor="middle">State: 16</text>
  <text id="text20" class="param-text" x="465" y="296" text-anchor="middle">Conv: 4</text>
  <line id="line20" class="arrow-line-thin" x1="520" y1="277" x2="560" y2="277"/>

  <!-- Box 3: first residual -->
  <rect id="rect20" class="component-box" x="560" y="255" width="130" height="45"/>
  <text id="text21" class="component-text" x="625" y="275" text-anchor="middle">Residual +</text>
  <text id="text22" class="param-text" x="625" y="290" text-anchor="middle">LayerNorm → FFN</text>
  <line id="line22" class="arrow-line-thin" x1="690" y1="277" x2="730" y2="277"/>

  <!-- Box 4: feed-forward -->
  <rect id="rect22" class="component-box" x="730" y="255" width="120" height="45"/>
  <text id="text23" class="component-text" x="790" y="272" text-anchor="middle">GeLU FFN</text>
  <text id="text24" class="param-text" x="790" y="287" text-anchor="middle">4096 hidden</text>
  <text id="text25" class="param-text" x="790" y="297" text-anchor="middle">expand=2</text>
  <!-- Connector across the 850 to 870 gap, matching the arrows between the other boxes. -->
  <line id="line25" class="arrow-line-thin" x1="850" y1="277" x2="870" y2="277"/>

  <!-- Box 5: second residual -->
  <rect id="rect25" class="component-box" x="870" y="255" width="110" height="45"/>
  <text id="text26" class="component-text" x="925" y="280" text-anchor="middle">Residual +</text>
</g>
<!-- Arrow running from y=310 to y=330. -->
<line id="line26" class="arrow-line" x1="600" y1="310" x2="600" y2="330"/>
<!-- Layer 1 Compact -->
<rect id="rect26" x="300" y="330" width="600" height="50" rx="6"
      fill="#fff" stroke="#1976D2" stroke-width="2"/>
<text id="text27" class="layer-title" x="600" y="355" text-anchor="middle">Layer 1: MambaBlock</text>
<text id="text28" class="layer-subtitle" x="600" y="370" text-anchor="middle">(same structure as Layer 0)</text>
<!-- Outgoing arrow; it stops at y=410. -->
<line id="line28" class="arrow-line" x1="600" y1="380" x2="600" y2="410"/>
<!-- ========== ZONE 2: MoE Middle ========== -->
<!-- Gradient backdrop and dashed outline share one geometry; the backdrop states its
     fill both as an attribute and in the style attribute, with the same value. -->
<rect id="rect28" class="zone-bg" x="50" y="447.45297" width="1100" height="480" rx="10"
      fill="url(#moeGrad)" style="fill:url(#moeGrad)"/>
<rect id="rect29" x="50" y="447.45297" width="1100" height="480" rx="10"
      fill="none" stroke="#388e3c" stroke-width="2" stroke-dasharray="5, 5"/>
<text id="text29" class="zone-label" x="70" y="472.45297">Zone 2: MoE Layers (Layers 2-21)</text>
<text id="text30" class="param-text" x="70" y="489.45297">Conditional computation • Dynamic expert routing • Budget-aware selection</text>
<!-- Detailed MoE Layer (Layer 2) -->
<!--
  Outer frame with two side-by-side panels: MoE Attention (left) and MoE FFN (right).
  The whole group is shifted up by 2.5470399 via its transform.
  Font overrides on .component-text elements are written as inline style because the
  class rule (12px, weight 500) outranks font-size / font-weight presentation
  attributes, which therefore never took effect.
-->
<g id="moe-layer-2" transform="translate(0,-2.5470399)">
  <rect id="rect30" x="150" y="510" width="900" height="200" rx="6"
        fill="#ffffff" stroke="#388e3c" stroke-width="2"/>
  <text id="text31" class="layer-title" x="600" y="532" text-anchor="middle">Layer 2-21: RoutedBlock (MoE Attention + MoE FFN)</text>

  <!-- MoE Attention Section -->
  <g id="moe-attention">
    <rect id="rect31" x="170" y="545" width="400" height="155" rx="4"
          fill="#e8f5e9" stroke="#66bb6a" stroke-width="1.5"/>
    <text id="text32" class="component-text" x="370" y="562" text-anchor="middle" style="font-weight:bold">MoE Attention (6 Experts)</text>

    <!-- Router -->
    <rect id="rect32" class="router-box" x="190" y="575" width="120" height="35"/>
    <text id="text33" class="component-text" x="250" y="590" text-anchor="middle" style="font-size:11px">Attn Router</text>
    <text id="text34" class="param-text" x="250" y="602" text-anchor="middle">top_k: 4-6</text>
    <line id="line34" class="arrow-line-thin" x1="310" y1="592" x2="340" y2="592"/>

    <!-- Experts: three columns of two 60 x 20 boxes -->
    <g id="expert-heads">
      <rect id="rect34" class="expert-box" x="340" y="572" width="60" height="20"/>
      <text id="text35" x="370" y="586" text-anchor="middle" font-size="9px" fill="#333333">Head 0</text>
      <rect id="rect35" class="expert-box" x="340" y="594" width="60" height="20"/>
      <text id="text36" x="370" y="608" text-anchor="middle" font-size="9px" fill="#333333">Head 1</text>
      <rect id="rect36" class="expert-box" x="410" y="572" width="60" height="20"/>
      <text id="text37" x="440" y="586" text-anchor="middle" font-size="9px" fill="#333333">Head 2</text>
      <rect id="rect37" class="expert-box" x="410" y="594" width="60" height="20"/>
      <text id="text38" x="440" y="608" text-anchor="middle" font-size="9px" fill="#333333">Head 3</text>
      <rect id="rect38" class="expert-box" x="480" y="572" width="60" height="20"/>
      <text id="text39" x="510" y="586" text-anchor="middle" font-size="9px" fill="#333333">Head 4</text>
      <rect id="rect39" class="expert-box" x="480" y="594" width="60" height="20"/>
      <text id="text40" x="510" y="608" text-anchor="middle" font-size="9px" fill="#333333">Head 5</text>
    </g>

    <text id="text41" class="param-text" x="370" y="630" text-anchor="middle">Each: Q/K/V proj + RoPE + SDPA</text>
    <text id="text42" class="param-text" x="370" y="642" text-anchor="middle">Head dim: 64 (1024/16)</text>

    <!-- Weighted combine -->
    <rect id="rect42" class="component-box" x="230" y="655" width="280" height="30"/>
    <text id="text43" class="component-text" x="370" y="675" text-anchor="middle" style="font-size:11px">Weighted Combination → Output Proj</text>
  </g>

  <!-- MoE FFN Section -->
  <g id="moe-ffn">
    <rect id="rect43" x="600" y="545" width="430" height="155" rx="4"
          fill="#e8f5e9" stroke="#66bb6a" stroke-width="1.5"/>
    <text id="text44" class="component-text" x="815" y="562" text-anchor="middle" style="font-weight:bold">MoE FFN (4 Experts)</text>

    <!-- Router -->
    <rect id="rect44" class="router-box" x="650" y="575" width="120" height="35"/>
    <text id="text45" class="component-text" x="710" y="590" text-anchor="middle" style="font-size:11px">FFN Router</text>
    <text id="text46" class="param-text" x="710" y="602" text-anchor="middle">top_k: 1-2</text>
    <line id="line46" class="arrow-line-thin" x1="770" y1="592" x2="800" y2="592"/>

    <!-- Experts: two columns of two 100 x 20 boxes -->
    <g id="ffn-experts">
      <rect id="rect46" class="expert-box" x="800" y="572" width="100" height="20"/>
      <text id="text47" x="850" y="586" text-anchor="middle" font-size="9px" fill="#333333">Expert 0 (FFN)</text>
      <rect id="rect47" class="expert-box" x="800" y="594" width="100" height="20"/>
      <text id="text48" x="850" y="608" text-anchor="middle" font-size="9px" fill="#333333">Expert 1 (FFN)</text>
      <rect id="rect48" class="expert-box" x="910" y="572" width="100" height="20"/>
      <text id="text49" x="960" y="586" text-anchor="middle" font-size="9px" fill="#333333">Expert 2 (FFN)</text>
      <rect id="rect49" class="expert-box" x="910" y="594" width="100" height="20"/>
      <text id="text50" x="960" y="608" text-anchor="middle" font-size="9px" fill="#333333">Expert 3 (FFN)</text>
    </g>

    <text id="text51" class="param-text" x="815" y="630" text-anchor="middle">Each: Linear(1024→4096) + GeLU + Linear(4096→1024)</text>
    <text id="text52" class="param-text" x="815" y="642" text-anchor="middle">Token-level routing with STE gating</text>

    <!-- Output -->
    <rect id="rect52" class="component-box" x="680" y="655" width="270" height="30"/>
    <text id="text53" class="component-text" x="815" y="675" text-anchor="middle" style="font-size:11px">Gated Combination + Load Balance</text>
  </g>
</g>
<!-- Arrow running from y=707.45294 to the top edge of the compact box. -->
<line id="line53" class="arrow-line" x1="600" y1="707.45294" x2="600" y2="727.45294"/>
<!-- Compact representation of layers 3-21 -->
<!-- The zone heading covers layers 2-21 and the detailed block is labelled as layer 2,
     so this box stands for layers 3 through 21, i.e. 19 layers. -->
<rect id="rect53" x="400" y="727.45294" width="400" height="80" rx="6"
      fill="#ffffff" stroke="#388e3c" stroke-width="2" stroke-dasharray="3, 3"/>
<!-- text54 has no content in the source. NOTE(review): presumably it once held a
     glyph such as a vertical ellipsis; confirm before filling it in. The size
     override is inline style so that it can outrank the .layer-title rule. -->
<text id="text54" class="layer-title" x="600" y="757.45294" text-anchor="middle" style="font-size:20px"></text>
<text id="text55" class="layer-subtitle" x="600" y="782.45294" text-anchor="middle">Layers 3-21 (19 more MoE layers)</text>
<text id="text56" class="param-text" x="600" y="797.45294" text-anchor="middle">Same structure: MoE Attention (6 experts) + MoE FFN (4 experts)</text>
<!-- Outgoing arrow; it stops at y=857.45294. -->
<line id="line56" class="arrow-line" x1="600" y1="807.45294" x2="600" y2="857.45294"/>
<!-- ========== ZONE 3: Enhanced Mamba ========== -->
<!-- Backdrop and dashed outline now share y=937.35657, the value the labels below are
     offset from (+25 and +42). The outline was previously 1.698 units higher than the
     backdrop, unlike the other two zone frames. -->
<rect id="rect56" class="zone-bg" x="50" y="937.35657" width="1100" height="280" rx="10"
      fill="url(#mambaEnhGrad)" style="fill:url(#mambaEnhGrad)"/>
<rect id="rect57" x="50" y="937.35657" width="1100" height="280" rx="10"
      fill="none" stroke="#0288d1" stroke-width="2" stroke-dasharray="5, 5"/>
<text id="text57" class="zone-label" x="70" y="962.35657">Zone 3: Enhanced Mamba (Layers 22-23)</text>
<text id="text58" class="param-text" x="70" y="979.35657">High-capacity refinement • d_state=16, expand=4 (2× capacity)</text>
<!-- Layer 22 -->
<!-- Framed block with one inner component strip. The strip label uses inline style for
     its 11px size, because the .component-text rule outranks a font-size attribute. -->
<rect id="rect58" x="200" y="997.35657" width="800" height="70" rx="6"
      fill="#ffffff" stroke="#0288d1" stroke-width="2"/>
<text id="text59" class="layer-title" x="600" y="1019.3566" text-anchor="middle">Layer 22: Enhanced MambaBlock</text>
<rect id="rect59" class="component-box" x="300" y="1032.3566" width="600" height="25"/>
<text id="text60" class="component-text" x="600" y="1049.3566" text-anchor="middle" style="font-size:11px">Mamba SSM + FFN (expand=4, 8192 hidden dim)</text>
<!-- Arrow from the bottom edge of this block down to y=1087.3566. -->
<line id="line60" class="arrow-line" x1="600" y1="1067.3566" x2="600" y2="1087.3566"/>
<!-- Layer 23 -->
<!-- Compact block: title, a back-reference subtitle, and an outgoing arrow. -->
<rect id="rect60" x="300" y="1087.3566" width="600" height="50" rx="6"
      fill="#ffffff" stroke="#0288d1" stroke-width="2"/>
<text id="text61" class="layer-title" x="600" y="1109.3566" text-anchor="middle">Layer 23: Enhanced MambaBlock</text>
<text id="text62" class="layer-subtitle" x="600" y="1127.3566" text-anchor="middle">(same structure as Layer 22)</text>
<!-- The arrow stops at y=1157.3566. -->
<line id="line62" class="arrow-line" x1="600" y1="1137.3566" x2="600" y2="1157.3566"/>
<!-- Output layers -->
<!-- Single grey box with its label centred on x=600. -->
<rect id="rect62" x="400" y="1220.4116" width="400" height="40" rx="6"
      fill="#e0e0e0" stroke="#757575" stroke-width="2"/>
<text id="text63" class="component-text" x="600" y="1245.4116" text-anchor="middle">Final LayerNorm → LM Head → Logits</text>
<!-- Legend -->
<!--
  Framed panel with two columns: colour/shape keys on the left (x=95) and the
  budget-ratio notes on the right (x=550 and x=565).
  The heading's 14px size is written as inline style because the .zone-label rule
  (18px) outranks a font-size presentation attribute.
-->
<g id="legend">
  <rect id="rect63" x="50" y="1270" width="1100" height="110" rx="8"
        fill="#fff" stroke="#999" stroke-width="1.5"/>
  <text id="text64" class="zone-label" x="70" y="1290" style="font-size:14px">Legend n' Key Features</text>

  <!-- Left column: keys -->
  <circle id="circle64" cx="80" cy="1310" r="6" fill="#1976D2"/>
  <text id="text65" class="legend-text" x="95" y="1315">Mamba: State Space Model (O(n) complexity, efficient long context)</text>
  <circle id="circle65" cx="80" cy="1330" r="6" fill="#388E3C"/>
  <text id="text66" class="legend-text" x="95" y="1335">MoE: Mixture of Experts (conditional computation, sparse activation)</text>
  <rect id="rect66" class="router-box" x="75" y="1343" width="15" height="12"/>
  <text id="text67" class="legend-text" x="95" y="1352">Router: Dynamic expert selection based on input content</text>

  <!-- Right column: budget ratio notes. The bold attribute works here because
       .legend-text sets no font-weight of its own. -->
  <text id="text68" class="legend-text" x="550" y="1315" font-weight="bold">Budget Ratio: Runtime control parameter (0.0-1.0)</text>
  <text id="text69" class="legend-text" x="565" y="1330">• Controls active expert count (speed vs. quality tradeoff)</text>
  <text id="text70" class="legend-text" x="565" y="1345">• Example: 0.5 = 50% fewer experts active → 2× faster inference</text>
  <text id="text71" class="legend-text" x="565" y="1360">• Auxiliary losses: Load balancing, Router-Z, Entropy regularization</text>
</g>
</svg>