<!doctype html>
| <html lang="en"> | |
| <head> | |
| <meta charset="UTF-8"> | |
| <meta name="viewport" content="width=device-width, initial-scale=1.0"> | |
| <title>Null Space Projection - Interactive Demo</title> | |
| <link rel="icon" type="image/svg+xml" href="data:image/svg+xml,<svg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 32 32'><rect fill='%231a1a2e' width='32' height='32' rx='4'/><line x1='4' y1='28' x2='28' y2='4' stroke='%234caf50' stroke-width='2'/><line x1='4' y1='16' x2='20' y2='8' stroke='%23f858fb' stroke-width='2.5'/><circle cx='20' cy='8' r='2' fill='%23f858fb'/><line x1='4' y1='16' x2='16' y2='16' stroke='%2300d4ff' stroke-width='2.5'/><circle cx='16' cy='16' r='2' fill='%2300d4ff'/><line x1='20' y1='8' x2='16' y2='16' stroke='white' stroke-width='1' stroke-dasharray='2'/></svg>"> | |
| <style> | |
| * { | |
| box-sizing: border-box; | |
| margin: 0; | |
| padding: 0; | |
| } | |
| body { | |
| font-family: 'Segoe UI', system-ui, sans-serif; | |
| background: linear-gradient(135deg, #1a1a2e 0%, #16213e 100%); | |
| color: #e0e0e0; | |
| min-height: 100vh; | |
| padding: 20px; | |
| } | |
| .container { | |
| max-width: 1400px; | |
| margin: 0 auto; | |
| } | |
| h1 { | |
| text-align: center; | |
| color: #00d4ff; | |
| margin-bottom: 10px; | |
| font-size: 2.2em; | |
| text-shadow: 0 0 20px rgba(0, 212, 255, 0.3); | |
| } | |
| .subtitle { | |
| text-align: center; | |
| color: #888; | |
| margin-bottom: 30px; | |
| font-size: 1.1em; | |
| } | |
| .main-grid { | |
| display: grid; | |
| grid-template-columns: 1fr 1fr; | |
| gap: 20px; | |
| margin-bottom: 20px; | |
| } | |
| .panel { | |
| background: rgba(255, 255, 255, 0.05); | |
| border-radius: 12px; | |
| padding: 20px; | |
| border: 1px solid rgba(255, 255, 255, 0.1); | |
| } | |
| .panel h2 { | |
| color: #00d4ff; | |
| margin-bottom: 15px; | |
| font-size: 1.3em; | |
| border-bottom: 1px solid rgba(0, 212, 255, 0.3); | |
| padding-bottom: 10px; | |
| } | |
| .panel h3 { | |
| color: #f858fb; | |
| margin: 15px 0 10px 0; | |
| font-size: 1.1em; | |
| } | |
| .canvas-container { | |
| background: rgba(0, 0, 0, 0.3); | |
| border-radius: 8px; | |
| padding: 10px; | |
| display: flex; | |
| justify-content: center; | |
| align-items: center; | |
| } | |
| canvas { | |
| border-radius: 4px; | |
| } | |
| .controls { | |
| display: grid; | |
| gap: 15px; | |
| } | |
| .control-group { | |
| background: rgba(0, 0, 0, 0.2); | |
| padding: 15px; | |
| border-radius: 8px; | |
| } | |
| .control-group label { | |
| display: block; | |
| margin-bottom: 8px; | |
| color: #aaa; | |
| font-size: 0.9em; | |
| } | |
| .slider-row { | |
| display: flex; | |
| align-items: center; | |
| gap: 10px; | |
| margin-bottom: 10px; | |
| } | |
| .slider-row span { | |
| min-width: 80px; | |
| color: #00d4ff; | |
| } | |
| .slider-row input[type="range"] { | |
| flex: 1; | |
| height: 6px; | |
| border-radius: 3px; | |
| background: #333; | |
| outline: none; | |
| -webkit-appearance: none; | |
| } | |
| .slider-row input[type="range"]::-webkit-slider-thumb { | |
| -webkit-appearance: none; | |
| width: 16px; | |
| height: 16px; | |
| border-radius: 50%; | |
| background: #00d4ff; | |
| cursor: pointer; | |
| } | |
| .slider-row .value { | |
| min-width: 50px; | |
| text-align: right; | |
| font-family: monospace; | |
| color: #fff; | |
| } | |
| button { | |
| background: linear-gradient(135deg, #00d4ff 0%, #0099cc 100%); | |
| color: #000; | |
| border: none; | |
| padding: 12px 24px; | |
| border-radius: 6px; | |
| font-size: 1em; | |
| font-weight: bold; | |
| cursor: pointer; | |
| transition: all 0.3s ease; | |
| width: 100%; | |
| margin-top: 10px; | |
| } | |
| button:hover { | |
| transform: translateY(-2px); | |
| box-shadow: 0 5px 20px rgba(0, 212, 255, 0.4); | |
| } | |
| button.secondary { | |
| background: linear-gradient(135deg, #f858fb 0%, #ee00ff 100%); | |
| } | |
| .legend { | |
| display: flex; | |
| flex-wrap: wrap; | |
| gap: 15px; | |
| margin-top: 15px; | |
| padding: 10px; | |
| background: rgba(0, 0, 0, 0.2); | |
| border-radius: 6px; | |
| } | |
| .legend-item { | |
| display: flex; | |
| align-items: center; | |
| gap: 8px; | |
| font-size: 0.85em; | |
| } | |
| .legend-color { | |
| width: 20px; | |
| height: 4px; | |
| border-radius: 2px; | |
| } | |
| .explanation { | |
| background: rgba(0, 212, 255, 0.1); | |
| border-left: 3px solid #00d4ff; | |
| padding: 15px; | |
| margin: 15px 0; | |
| border-radius: 0 8px 8px 0; | |
| font-size: 0.95em; | |
| line-height: 1.6; | |
| } | |
| .math { | |
| font-family: 'Cambria Math', 'Times New Roman', serif; | |
| background: rgba(0, 0, 0, 0.3); | |
| padding: 10px 15px; | |
| border-radius: 6px; | |
| margin: 10px 0; | |
| overflow-x: auto; | |
| font-size: 1.1em; | |
| } | |
| .math-inline { | |
| font-family: 'Cambria Math', 'Times New Roman', serif; | |
| color: #f858fb; | |
| } | |
| .results-grid { | |
| display: grid; | |
| grid-template-columns: repeat(3, 1fr); | |
| gap: 10px; | |
| margin-top: 15px; | |
| } | |
| .result-box { | |
| background: rgba(0, 0, 0, 0.3); | |
| padding: 12px; | |
| border-radius: 6px; | |
| text-align: center; | |
| } | |
| .result-box .label { | |
| font-size: 0.8em; | |
| color: #888; | |
| margin-bottom: 5px; | |
| } | |
| .result-box .value { | |
| font-family: monospace; | |
| font-size: 1.1em; | |
| color: #00d4ff; | |
| } | |
| .step-indicator { | |
| display: flex; | |
| justify-content: center; | |
| gap: 10px; | |
| margin: 20px 0; | |
| } | |
| .step { | |
| width: 40px; | |
| height: 40px; | |
| border-radius: 50%; | |
| background: rgba(255, 255, 255, 0.1); | |
| display: flex; | |
| align-items: center; | |
| justify-content: center; | |
| font-weight: bold; | |
| cursor: pointer; | |
| transition: all 0.3s ease; | |
| } | |
| .step.active { | |
| background: #00d4ff; | |
| color: #000; | |
| box-shadow: 0 0 20px rgba(0, 212, 255, 0.5); | |
| } | |
| .step.completed { | |
| background: #4caf50; | |
| color: #fff; | |
| } | |
| .full-width { | |
| grid-column: 1 / -1; | |
| } | |
| .code-block { | |
| background: #0d1117; | |
| border-radius: 6px; | |
| padding: 15px; | |
| font-family: 'Consolas', 'Monaco', monospace; | |
| font-size: 0.85em; | |
| overflow-x: auto; | |
| margin: 10px 0; | |
| white-space: pre; | |
| line-height: 1.5; | |
| } | |
| .code-block .comment { | |
| color: #6a737d; | |
| } | |
| .code-block .keyword { | |
| color: #ff7b72; | |
| } | |
| .code-block .function { | |
| color: #d2a8ff; | |
| } | |
| .code-block .string { | |
| color: #a5d6ff; | |
| } | |
| .code-block .number { | |
| color: #79c0ff; | |
| } | |
| .tabs { | |
| display: flex; | |
| gap: 5px; | |
| margin-bottom: 15px; | |
| } | |
| .tab { | |
| padding: 10px 20px; | |
| background: rgba(255, 255, 255, 0.05); | |
| border: none; | |
| color: #888; | |
| cursor: pointer; | |
| border-radius: 6px 6px 0 0; | |
| transition: all 0.3s ease; | |
| } | |
| .tab.active { | |
| background: rgba(0, 212, 255, 0.2); | |
| color: #00d4ff; | |
| } | |
| .tab-content { | |
| display: none; | |
| } | |
| .tab-content.active { | |
| display: block; | |
| } | |
| .highlight { | |
| color: #ffd700; | |
| font-weight: bold; | |
| } | |
| .math-breakdown-bar { | |
| grid-column: 1 / -1; | |
| background: rgba(0, 0, 0, 0.4); | |
| border-radius: 8px; | |
| padding: 20px; | |
| border: 1px solid rgba(255, 255, 255, 0.1); | |
| } | |
| .math-breakdown-bar .vectors-row { | |
| display: flex; | |
| justify-content: center; | |
| gap: 50px; | |
| font-family: 'Consolas', 'Monaco', monospace; | |
| font-size: 1.1em; | |
| margin-bottom: 15px; | |
| padding-bottom: 15px; | |
| border-bottom: 1px solid rgba(255, 255, 255, 0.1); | |
| } | |
| .math-breakdown-bar .vector-item { | |
| display: flex; | |
| align-items: center; | |
| gap: 10px; | |
| } | |
| .math-breakdown-bar .vector-label { | |
| font-weight: bold; | |
| } | |
| .math-breakdown-bar .vector-value { | |
| color: #fff; | |
| background: rgba(255, 255, 255, 0.05); | |
| padding: 4px 12px; | |
| border-radius: 4px; | |
| } | |
| .math-breakdown-bar .calculations-row { | |
| display: grid; | |
| grid-template-columns: 1fr 1fr 1fr; | |
| gap: 20px; | |
| font-family: 'Consolas', 'Monaco', monospace; | |
| font-size: 0.85em; | |
| line-height: 1.7; | |
| } | |
| .math-breakdown-bar .calc-section { | |
| background: rgba(0, 0, 0, 0.3); | |
| padding: 12px 15px; | |
| border-radius: 6px; | |
| } | |
| .math-breakdown-bar .section-label { | |
| color: #888; | |
| font-size: 0.8em; | |
| margin-bottom: 8px; | |
| text-transform: uppercase; | |
| letter-spacing: 1px; | |
| } | |
| .math-breakdown-bar .k-color { color: #4caf50; } | |
| .math-breakdown-bar .dw-color { color: #f858fb; } | |
| .math-breakdown-bar .dwp-color { color: #00d4ff; } | |
| .math-breakdown-bar .calc-intermediate { color: #ffd700; } | |
| .math-breakdown-bar .calc-result { color: #fff; font-weight: bold; } | |
| .math-breakdown-bar .verification-pass { color: #4caf50; font-weight: bold; } | |
| .math-breakdown-bar .verification-fail { color: #f858fb; font-weight: bold; } | |
| @media (max-width: 1000px) { | |
| .main-grid { | |
| grid-template-columns: 1fr; | |
| } | |
| .math-breakdown-bar .vectors-row { | |
| flex-wrap: wrap; | |
| gap: 15px; | |
| } | |
| .math-breakdown-bar .calculations-row { | |
| grid-template-columns: 1fr; | |
| } | |
| } | |
| </style> | |
| </head> | |
| <body> | |
| <div class="container"> | |
| <h1>Null Space Projection</h1> | |
| <p class="subtitle">Interactive visualization of how null space constraints preserve capabilities during model modification</p> | |
| <div class="step-indicator"> | |
| <div class="step active" data-step="1" onclick="setStep(1)">1</div> | |
| <div class="step" data-step="2" onclick="setStep(2)">2</div> | |
| <div class="step" data-step="3" onclick="setStep(3)">3</div> | |
| </div> | |
| <div class="main-grid"> | |
| <!-- Left Panel: Visualization --> | |
| <div class="panel"> | |
| <h2>2D Vector Space Visualization</h2> | |
| <div class="canvas-container"> | |
| <canvas id="mainCanvas" width="500" height="500"></canvas> | |
| </div> | |
| <div class="legend"> | |
| <div class="legend-item"> | |
| <div class="legend-color" style="background: #4caf50;"></div> | |
| <span>Preservation Vector (K)</span> | |
| </div> | |
| <div class="legend-item"> | |
| <div class="legend-color" style="background: #f858fb;"></div> | |
| <span>Original Update (ΔW)</span> | |
| </div> | |
| <div class="legend-item"> | |
| <div class="legend-color" style="background: #00d4ff;"></div> | |
| <span>Projected Update (ΔW')</span> | |
| </div> | |
| <div class="legend-item"> | |
| <div class="legend-color" style="background: rgba(76, 175, 80, 0.2);"></div> | |
| <span>Row Space of K</span> | |
| </div> | |
| <div class="legend-item"> | |
| <div class="legend-color" style="background: rgba(0, 212, 255, 0.2);"></div> | |
| <span>Null Space of K</span> | |
| </div> | |
| </div> | |
| </div> | |
| <!-- Right Panel: Controls & Explanation --> | |
| <div class="panel"> | |
| <h2>Interactive Controls</h2> | |
| <div class="controls"> | |
| <div class="control-group"> | |
| <label>Preservation Vector K (what we want to preserve)</label> | |
| <div class="slider-row"> | |
| <span>K<sub>x</sub></span> | |
| <input type="range" id="kx" min="-100" max="100" value="80"> | |
| <span class="value" id="kx-val">0.80</span> | |
| </div> | |
| <div class="slider-row"> | |
| <span>K<sub>y</sub></span> | |
| <input type="range" id="ky" min="-100" max="100" value="30"> | |
| <span class="value" id="ky-val">0.30</span> | |
| </div> | |
| </div> | |
| <div class="control-group"> | |
| <label>Original Update ΔW (modification we want to apply)</label> | |
| <div class="slider-row"> | |
| <span>ΔW<sub>x</sub></span> | |
| <input type="range" id="dwx" min="-100" max="100" value="60"> | |
| <span class="value" id="dwx-val">0.60</span> | |
| </div> | |
| <div class="slider-row"> | |
| <span>ΔW<sub>y</sub></span> | |
| <input type="range" id="dwy" min="-100" max="100" value="70"> | |
| <span class="value" id="dwy-val">0.70</span> | |
| </div> | |
| </div> | |
| <button onclick="animateProjection()">Animate Projection</button> | |
| <button class="secondary" onclick="randomize()">Randomize Vectors</button> | |
| </div> | |
| <div class="results-grid"> | |
| <div class="result-box"> | |
| <div class="label">K · ΔW (before)</div> | |
| <div class="value" id="dot-before">-</div> | |
| </div> | |
| <div class="result-box"> | |
| <div class="label">K · ΔW' (after)</div> | |
| <div class="value" id="dot-after">-</div> | |
| </div> | |
| <div class="result-box"> | |
| <div class="label">|ΔW'| / |ΔW|</div> | |
| <div class="value" id="magnitude-ratio">-</div> | |
| </div> | |
| </div> | |
| </div> | |
| <!-- Full-width Math Breakdown --> | |
| <div class="math-breakdown-bar"> | |
| <!-- Vector Coordinates Row --> | |
| <div class="vectors-row"> | |
| <div class="vector-item"> | |
| <span class="vector-label k-color">K</span> | |
| <span class="vector-value">[<span id="bar-kx">0.80</span>, <span id="bar-ky">0.30</span>]</span> | |
| </div> | |
| <div class="vector-item"> | |
| <span class="vector-label dw-color">ΔW</span> | |
| <span class="vector-value">[<span id="bar-dwx">0.60</span>, <span id="bar-dwy">0.70</span>]</span> | |
| </div> | |
| <div class="vector-item"> | |
| <span class="vector-label dwp-color">ΔW'</span> | |
| <span class="vector-value">[<span id="bar-dwpx">-0.16</span>, <span id="bar-dwpy">0.42</span>]</span> | |
| </div> | |
| </div> | |
| <!-- Calculations Row (3 columns) --> | |
| <div class="calculations-row"> | |
| <!-- Dot Product --> | |
| <div class="calc-section"> | |
| <div class="section-label">Dot Product (K · ΔW)</div> | |
| <div>= (<span class="k-color" id="dot-kx">0.80</span> × <span class="dw-color" id="dot-dwx">0.60</span>) + (<span class="k-color" id="dot-ky">0.30</span> × <span class="dw-color" id="dot-dwy">0.70</span>)</div> | |
| <div>= <span class="calc-intermediate" id="dot-term1">0.48</span> + <span class="calc-intermediate" id="dot-term2">0.21</span></div> | |
| <div>= <span class="calc-result" id="dot-result">0.69</span></div> | |
| </div> | |
| <!-- Projection Formula --> | |
| <div class="calc-section"> | |
| <div class="section-label">Projection Formula</div> | |
| <div><span class="dwp-color">ΔW'</span> = <span class="dw-color">ΔW</span> − [(K·ΔW) / |K|²] × <span class="k-color">K</span></div> | |
| <div style="color:#666">|K|² = <span id="k-mag-sq">0.73</span>, scale = <span id="proj-scale">0.95</span></div> | |
| <div>= [<span class="dw-color" id="proj-dwx">0.60</span>, <span class="dw-color" id="proj-dwy">0.70</span>] − [<span class="calc-intermediate" id="proj-subx">0.76</span>, <span class="calc-intermediate" id="proj-suby">0.28</span>]</div> | |
| <div>= <span class="dwp-color">[<span id="math-dwpx">-0.16</span>, <span id="math-dwpy">0.42</span>]</span></div> | |
| </div> | |
| <!-- Verification --> | |
| <div class="calc-section"> | |
| <div class="section-label">Verification (K · ΔW')</div> | |
| <div>= (<span class="k-color" id="ver-kx">0.80</span> × <span class="dwp-color" id="ver-dwpx">-0.16</span>) + (<span class="k-color" id="ver-ky">0.30</span> × <span class="dwp-color" id="ver-dwpy">0.42</span>)</div> | |
| <div>= <span class="calc-intermediate" id="ver-term1">-0.128</span> + <span class="calc-intermediate" id="ver-term2">0.126</span></div> | |
| <div>= <span id="ver-result" class="verification-pass">≈ 0 ✓</span></div> | |
| </div> | |
| </div> | |
| </div> | |
| <!-- Step-by-step explanation --> | |
| <div class="panel full-width"> | |
| <div class="tabs"> | |
| <button class="tab active" onclick="showTab('concept')">Concept</button> | |
| <button class="tab" onclick="showTab('math')">Math</button> | |
| <button class="tab" onclick="showTab('code')">Code</button> | |
| <button class="tab" onclick="showTab('application')">ML Application</button> | |
| </div> | |
| <div id="concept" class="tab-content active"> | |
| <div id="step1-content" class="step-content active"> | |
| <h3>Step 1: The Problem</h3> | |
| <div class="explanation"> | |
| <p><strong>Goal:</strong> We want to modify a model's weights to remove unwanted behavior (like refusal), | |
| but we don't want to break its useful capabilities (like math, coding, reasoning).</p> | |
| <p style="margin-top: 10px;"><strong>The Challenge:</strong> A naive weight modification <span class="math-inline">ΔW</span> | |
| might accidentally affect the outputs for inputs we care about preserving.</p> | |
| <p style="margin-top: 10px;"><strong>Solution:</strong> Project <span class="math-inline">ΔW</span> into the | |
| <span class="highlight">null space</span> of the preservation activations, ensuring the modification | |
| has <em>zero effect</em> on preserved capabilities.</p> | |
| </div> | |
| </div> | |
| <div id="step2-content" class="step-content"> | |
| <h3>Step 2: Understanding Null Space</h3> | |
| <div class="explanation"> | |
| <p>The <span class="highlight">null space</span> of a matrix <span class="math-inline">K</span> contains | |
| all vectors <span class="math-inline">x</span> where <span class="math-inline">Kx = 0</span>.</p> | |
| <p style="margin-top: 10px;">In 2D visualization:</p> | |
| <ul style="margin-left: 20px; margin-top: 5px;"> | |
| <li>The <span style="color: #4caf50;">green line</span> shows the "row space" of K (directions that K responds to)</li> | |
| <li>The <span style="color: #00d4ff;">blue region</span> shows the "null space" (perpendicular to K)</li> | |
| <li>Any vector in the null space, when multiplied by K, gives zero!</li> | |
| </ul> | |
| </div> | |
| </div> | |
| <div id="step3-content" class="step-content"> | |
| <h3>Step 3: Projection & Result</h3> | |
| <div class="explanation"> | |
| <p>We decompose the original update <span class="math-inline">ΔW</span> into two parts:</p> | |
| <ul style="margin-left: 20px; margin-top: 5px;"> | |
| <li><strong>Row space component:</strong> Part that affects preservation inputs (we <em>remove</em> this)</li> | |
| <li><strong>Null space component:</strong> Part with zero effect on preservation (we <em>keep</em> this)</li> | |
| </ul> | |
| <p style="margin-top: 10px;">The projected update <span class="math-inline">ΔW'</span> = <span class="math-inline">ΔW</span> minus its row-space component.</p> | |
| <p style="margin-top: 10px;"><strong>Result:</strong></p> | |
| <ul style="margin-left: 20px; margin-top: 5px;"> | |
| <li><span style="color: #4caf50;">Preservation guaranteed:</span> <span class="math-inline">K · ΔW' = 0</span></li> | |
| <li><span style="color: #f858fb;">Some modification lost:</span> <span class="math-inline">|ΔW'| ≤ |ΔW|</span></li> | |
| </ul> | |
| <p style="margin-top: 10px;"><strong>Trade-off:</strong> The more aligned your update is with preservation directions, the more gets removed. | |
| In practice, refusal behavior often lives in directions somewhat orthogonal to general capabilities, so we can remove most of it while preserving capabilities!</p> | |
| </div> | |
| </div> | |
| </div> | |
| <div id="math" class="tab-content"> | |
| <h3>Mathematical Formulation</h3> | |
| <div class="explanation"> | |
| <p>Given preservation activations <span class="math-inline">K ∈ ℝ<sup>n×d</sup></span> (n samples, d dimensions):</p> | |
| </div> | |
| <div class="math"> | |
| <strong>1. Compute SVD of K:</strong><br> | |
| K = UΣV<sup>T</sup> | |
| </div> | |
| <div class="math"> | |
| <strong>2. Build null space projector:</strong><br> | |
| P<sub>null</sub> = I - V<sub>r</sub>V<sub>r</sub><sup>T</sup><br> | |
| <span style="color: #888; font-size: 0.9em;">(where V<sub>r</sub> contains the r significant right singular vectors)</span> | |
| </div> | |
| <div class="math"> | |
| <strong>3. Project update into null space:</strong><br> | |
| ΔW' = ΔW · P<sub>null</sub> | |
| </div> | |
| <div class="math"> | |
| <strong>4. Verify preservation:</strong><br> | |
K · ΔW'<sup>T</sup> = K · (I - V<sub>r</sub>V<sub>r</sub><sup>T</sup>) · ΔW<sup>T</sup> = (K - K · V<sub>r</sub>V<sub>r</sub><sup>T</sup>) · ΔW<sup>T</sup> ≈ 0
| </div> | |
| <div class="explanation" style="margin-top: 20px;"> | |
<p><strong>Why it works:</strong> The columns of V<sub>r</sub> (the right singular vectors of K) span the row space of K. Subtracting <span class="math-inline">V<sub>r</sub>V<sub>r</sub><sup>T</sup></span>
removes all components in the row space, leaving only the null space.</p>
| </div> | |
| </div> | |
| <div id="code" class="tab-content"> | |
| <h3>Implementation</h3> | |
| <div class="code-block"> | |
| <span class="keyword">import</span> torch | |
| <span class="keyword">def</span> <span class="function">compute_null_space_projector</span>(K): | |
| <span class="string">""" | |
| Compute projector onto null space of K. | |
| Args: | |
| K: Preservation activations [n_samples, d_model] | |
| Returns: | |
| P_null: Projector matrix [d_model, d_model] | |
| """</span> | |
| <span class="comment"># Compute SVD (no centering - we want exact null space of K)</span> | |
| U, S, Vh = torch.linalg.svd(K, full_matrices=<span class="keyword">False</span>) | |
| <span class="comment"># Use all right singular vectors (rows of Vh) as row space basis</span> | |
| <span class="comment"># V_r columns span the row space of K</span> | |
| V_r = Vh.T <span class="comment"># [d_model, rank] where rank = min(n_samples, d_model)</span> | |
| <span class="comment"># Null space projector: I - V_r @ V_r.T</span> | |
| <span class="comment"># Projects onto orthogonal complement of row space</span> | |
| P_null = torch.eye(K.shape[<span class="number">1</span>], device=K.device, dtype=K.dtype) - V_r @ V_r.T | |
| <span class="keyword">return</span> P_null | |
| <span class="keyword">def</span> <span class="function">apply_null_space_projection</span>(delta_W, P_null): | |
| <span class="string">"""Project weight update into null space."""</span> | |
| <span class="keyword">return</span> delta_W @ P_null | |
| <span class="comment"># ===========================================</span> | |
| <span class="comment"># Runnable example with synthetic data:</span> | |
| <span class="comment"># ===========================================</span> | |
| <span class="comment"># Simulate preservation activations (e.g., from math/coding prompts)</span> | |
| <span class="comment"># Shape: [n_samples, hidden_dim]</span> | |
| n_samples, hidden_dim = <span class="number">50</span>, <span class="number">128</span> | |
| K = torch.randn(n_samples, hidden_dim) | |
| <span class="comment"># Compute the null space projector</span> | |
| P_null = compute_null_space_projector(K) | |
| <span class="comment"># Simulate a weight update we want to apply (e.g., refusal direction)</span> | |
| delta_W = torch.randn(hidden_dim) | |
| <span class="comment"># Project into null space (safe update)</span> | |
| delta_W_safe = apply_null_space_projection(delta_W, P_null) | |
| <span class="comment"># Verify: K @ delta_W_safe should be ~0 for all samples</span> | |
| effect_on_preservation = K @ delta_W_safe | |
| print(<span class="string">f"Max effect on preservation inputs: {effect_on_preservation.abs().max():.2e}"</span>) | |
| <span class="comment"># Output: Max effect on preservation inputs: ~1e-6 (effectively zero!)</span> | |
| </div> | |
| </div> | |
| <div id="application" class="tab-content"> | |
| <h3>Application to Model Abliteration</h3> | |
| <div class="explanation"> | |
| <p>In the context of removing refusal behavior from language models:</p> | |
| </div> | |
| <table style="width: 100%; margin: 15px 0; border-collapse: collapse;"> | |
| <tr style="background: rgba(0,0,0,0.3);"> | |
| <th style="padding: 12px; text-align: left; border-bottom: 1px solid #333;">Component</th> | |
| <th style="padding: 12px; text-align: left; border-bottom: 1px solid #333;">In Demo</th> | |
| <th style="padding: 12px; text-align: left; border-bottom: 1px solid #333;">In Abliteration</th> | |
| </tr> | |
| <tr> | |
| <td style="padding: 12px; border-bottom: 1px solid #222;">K</td> | |
| <td style="padding: 12px; border-bottom: 1px solid #222;">Preservation vector</td> | |
| <td style="padding: 12px; border-bottom: 1px solid #222;">Activations from math, coding, reasoning prompts</td> | |
| </tr> | |
| <tr> | |
| <td style="padding: 12px; border-bottom: 1px solid #222;">ΔW</td> | |
| <td style="padding: 12px; border-bottom: 1px solid #222;">Original update</td> | |
| <td style="padding: 12px; border-bottom: 1px solid #222;">Refusal direction projection</td> | |
| </tr> | |
| <tr> | |
| <td style="padding: 12px; border-bottom: 1px solid #222;">ΔW'</td> | |
| <td style="padding: 12px; border-bottom: 1px solid #222;">Projected update</td> | |
| <td style="padding: 12px; border-bottom: 1px solid #222;">Safe refusal removal (preserves capabilities)</td> | |
| </tr> | |
| <tr> | |
| <td style="padding: 12px;">K · ΔW' = 0</td> | |
| <td style="padding: 12px;">Dot product is zero</td> | |
| <td style="padding: 12px;">Math/coding outputs unchanged</td> | |
| </tr> | |
| </table> | |
| <div class="explanation" style="margin-top: 15px;"> | |
| <p><strong>Practical considerations:</strong></p> | |
| <ul style="margin-left: 20px; margin-top: 5px;"> | |
| <li>Use diverse preservation prompts (35+ covering math, coding, reasoning, etc.)</li> | |
| <li>rank_ratio of 0.95 keeps most capability variance while allowing some modification</li> | |
<li>Higher rank_ratio = more aggressive preservation (but less refusal removal)</li>
| <li>Compute P_null once per layer, reuse for all weight matrices in that layer</li> | |
| </ul> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| </div> | |
| <script> | |
// Canvas element and its 2D context, shared by all drawing routines below
const canvas = document.getElementById('mainCanvas');
const ctx = canvas.getContext('2d');
// Canvas size in pixels, read once from the element
const { width, height } = canvas;
// Canvas-space position of the world origin (middle of the canvas)
const centerX = width / 2;
const centerY = height / 2;
// Pixels per world unit
const scale = 200;
// Mutable animation / walkthrough state
let animationProgress = 0;
let isAnimating = false;
let currentStep = 1;
| // Get slider values | |
// Read the preservation vector K from the "kx" / "ky" sliders.
// Each slider value is parsed as a float and divided by 100.
// Returns a plain { x, y } object.
function getK() {
    const readSlider = (id) => parseFloat(document.getElementById(id).value) / 100;
    const x = readSlider('kx');
    const y = readSlider('ky');
    return { x, y };
}
// Read the original update vector ΔW from the "dwx" / "dwy" sliders.
// Each slider value is parsed as a float and divided by 100.
// Returns a plain { x, y } object.
function getDW() {
    const readSlider = (id) => parseFloat(document.getElementById(id).value) / 100;
    const x = readSlider('dwx');
    const y = readSlider('dwy');
    return { x, y };
}
| // Math helpers | |
// 2D dot product of two { x, y } vectors.
function dot(a, b) {
    const xTerm = a.x * b.x;
    const yTerm = a.y * b.y;
    return xTerm + yTerm;
}
// Euclidean length of a 2D { x, y } vector.
function magnitude(v) {
    const { x, y } = v;
    return Math.sqrt(x * x + y * y);
}
// Scale a 2D vector to unit length.
// A vector whose length is not greater than zero maps to { x: 0, y: 0 }
// instead of dividing by zero.
function normalize(v) {
    const length = magnitude(v);
    if (length > 0) {
        return { x: v.x / length, y: v.y / length };
    }
    return { x: 0, y: 0 };
}
// Project dw onto the null space of k, i.e. the line perpendicular to k.
// Equivalent to applying P_null = I - k*k^T / |k|^2: the component of dw
// along the unit vector of k is computed and subtracted.
// When k is the zero vector, normalize() yields { 0, 0 } and dw is returned
// unchanged (as a new object).
function projectToNullSpace(dw, k) {
    const unitK = normalize(k);
    const alongK = dot(dw, unitK);
    const removed = { x: alongK * unitK.x, y: alongK * unitK.y };
    return { x: dw.x - removed.x, y: dw.y - removed.y };
}
| // Drawing functions | |
// Convert a world-space { x, y } vector to canvas pixel coordinates.
// The world origin maps to (centerX, centerY); y is negated because canvas
// y grows downwards while the maths convention has y growing upwards.
function toCanvas(v) {
    const pixelX = centerX + v.x * scale;
    const pixelY = centerY - v.y * scale;
    return { x: pixelX, y: pixelY };
}
// Paint the background grid and the two coordinate axes.
// Grid lines are spaced scale / 2 pixels apart, two on each side of the
// centre plus one through it; the axes are then drawn on top, brighter and
// thicker.
function drawGrid() {
    const strokeSegment = (x0, y0, x1, y1) => {
        ctx.beginPath();
        ctx.moveTo(x0, y0);
        ctx.lineTo(x1, y1);
        ctx.stroke();
    };

    // Faint grid lines
    ctx.strokeStyle = 'rgba(255, 255, 255, 0.1)';
    ctx.lineWidth = 1;
    for (let i = -2; i <= 2; i++) {
        const offset = i * scale / 2;
        strokeSegment(centerX + offset, 0, centerX + offset, height); // vertical
        strokeSegment(0, centerY + offset, width, centerY + offset);  // horizontal
    }

    // Axes through the origin
    ctx.strokeStyle = 'rgba(255, 255, 255, 0.3)';
    ctx.lineWidth = 2;
    strokeSegment(0, centerY, width, centerY);
    strokeSegment(centerX, 0, centerX, height);
}
// Draw an arrow from world-space point `from` to world-space point `to`.
//   color     - used for both the shaft stroke and the arrowhead fill
//   lineWidth - shaft width in pixels (default 3)
//   arrowSize - length of the arrowhead sides in pixels (default 12)
// Leaves ctx.strokeStyle, ctx.fillStyle and ctx.lineWidth set to the values
// passed in.
function drawVector(from, to, color, lineWidth = 3, arrowSize = 12) {
    const fromPt = toCanvas(from);
    const toPt = toCanvas(to);
    // Direction pointing from the tip back towards the tail, in canvas pixels
    const backX = fromPt.x - toPt.x;
    const backY = fromPt.y - toPt.y;

    ctx.strokeStyle = color;
    ctx.fillStyle = color;
    ctx.lineWidth = lineWidth;

    // Shaft
    ctx.beginPath();
    ctx.moveTo(fromPt.x, fromPt.y);
    ctx.lineTo(toPt.x, toPt.y);
    ctx.stroke();

    // A vector shorter than half a pixel has no meaningful direction:
    // Math.atan2(0, 0) is 0 and floating-point residue gives an arbitrary
    // angle, so an arrowhead here would point somewhere misleading (this
    // happens when an update parallel to K projects onto the origin).
    const MIN_ARROW_LENGTH_PX = 0.5;
    if (Math.hypot(backX, backY) < MIN_ARROW_LENGTH_PX) {
        return;
    }

    // Arrowhead: a filled triangle with its apex at the tip and its two
    // other corners 30 degrees either side of the shaft
    const angle = Math.atan2(backY, backX);
    ctx.beginPath();
    ctx.moveTo(toPt.x, toPt.y);
    ctx.lineTo(
        toPt.x + arrowSize * Math.cos(angle - Math.PI / 6),
        toPt.y + arrowSize * Math.sin(angle - Math.PI / 6)
    );
    ctx.lineTo(
        toPt.x + arrowSize * Math.cos(angle + Math.PI / 6),
        toPt.y + arrowSize * Math.sin(angle + Math.PI / 6)
    );
    ctx.closePath();
    ctx.fill();
}
// Draw a dashed line through the origin along `direction`, spanning two
// world units either side, to represent a one-dimensional subspace.
//   direction - { x, y } vector; only its direction is used
//   color     - stroke colour of the dashed line
//   label     - accepted for call compatibility but not used here
// The line dash pattern is reset to solid before returning.
function drawSubspace(direction, color, label) {
    const norm = normalize(direction);
    // Endpoints two world units either side of the origin
    const lineStart = toCanvas({ x: norm.x * 2, y: norm.y * 2 });
    const lineEnd = toCanvas({ x: -norm.x * 2, y: -norm.y * 2 });
    ctx.strokeStyle = color;
    ctx.lineWidth = 2;
    ctx.setLineDash([5, 5]);
    ctx.beginPath();
    ctx.moveTo(lineStart.x, lineStart.y);
    ctx.lineTo(lineEnd.x, lineEnd.y);
    ctx.stroke();
    ctx.setLineDash([]);
}
// Visualise the null space of k: a translucent cyan band lying along the
// direction perpendicular to k, plus a dashed line through the origin on
// the band's inner edge. The band is 0.3 world units thick (towards k) and
// extends two world units either side of the origin.
function drawNullSpaceRegion(k) {
    const { x: nx, y: ny } = normalize(k);
    // Unit vector perpendicular to k (k rotated by 90 degrees)
    const px = -ny;
    const py = nx;

    // Band corners: the first two lie on the null-space line itself
    const corners = [
        { x: px * 2, y: py * 2 },
        { x: -px * 2, y: -py * 2 },
        { x: -px * 2 + nx * 0.3, y: -py * 2 + ny * 0.3 },
        { x: px * 2 + nx * 0.3, y: py * 2 + ny * 0.3 },
    ].map((corner) => toCanvas(corner));
    const [lineStart, lineEnd, ...farSide] = corners;

    // Filled band
    ctx.fillStyle = 'rgba(0, 212, 255, 0.1)';
    ctx.beginPath();
    ctx.moveTo(lineStart.x, lineStart.y);
    ctx.lineTo(lineEnd.x, lineEnd.y);
    for (const corner of farSide) {
        ctx.lineTo(corner.x, corner.y);
    }
    ctx.closePath();
    ctx.fill();

    // Dashed null-space line; dash pattern is reset afterwards
    ctx.strokeStyle = 'rgba(0, 212, 255, 0.5)';
    ctx.lineWidth = 2;
    ctx.setLineDash([10, 5]);
    ctx.beginPath();
    ctx.moveTo(lineStart.x, lineStart.y);
    ctx.lineTo(lineEnd.x, lineEnd.y);
    ctx.stroke();
    ctx.setLineDash([]);
}
// Visualise the row space of k: a translucent green band lying along the
// direction of k, plus a dashed line through the origin on the band's
// inner edge. The band is 0.3 world units thick (towards the perpendicular)
// and extends two world units either side of the origin.
function drawRowSpaceRegion(k) {
    const { x: nx, y: ny } = normalize(k);
    // Unit vector perpendicular to k (k rotated by 90 degrees)
    const px = -ny;
    const py = nx;

    // Band corners: the first two lie on the row-space line itself
    const corners = [
        { x: nx * 2, y: ny * 2 },
        { x: -nx * 2, y: -ny * 2 },
        { x: -nx * 2 + px * 0.3, y: -ny * 2 + py * 0.3 },
        { x: nx * 2 + px * 0.3, y: ny * 2 + py * 0.3 },
    ].map((corner) => toCanvas(corner));
    const [lineStart, lineEnd, ...farSide] = corners;

    // Filled band
    ctx.fillStyle = 'rgba(76, 175, 80, 0.1)';
    ctx.beginPath();
    ctx.moveTo(lineStart.x, lineStart.y);
    ctx.lineTo(lineEnd.x, lineEnd.y);
    for (const corner of farSide) {
        ctx.lineTo(corner.x, corner.y);
    }
    ctx.closePath();
    ctx.fill();

    // Dashed row-space line; dash pattern is reset afterwards
    ctx.strokeStyle = 'rgba(76, 175, 80, 0.5)';
    ctx.lineWidth = 2;
    ctx.setLineDash([10, 5]);
    ctx.beginPath();
    ctx.moveTo(lineStart.x, lineStart.y);
    ctx.lineTo(lineEnd.x, lineEnd.y);
    ctx.stroke();
    ctx.setLineDash([]);
}
// Draw a thin dashed connector from the tip of dw to the tip of
// dwProjected (both world-space { x, y } vectors).
// `k` is accepted but not used by this routine.
// The line dash pattern is reset to solid before returning.
function drawProjectionLine(dw, dwProjected, k) {
    const [tail, head] = [dw, dwProjected].map((v) => toCanvas(v));
    ctx.strokeStyle = 'rgba(255, 255, 255, 0.3)';
    ctx.lineWidth = 1;
    ctx.setLineDash([3, 3]);
    ctx.beginPath();
    ctx.moveTo(tail.x, tail.y);
    ctx.lineTo(head.x, head.y);
    ctx.stroke();
    ctx.setLineDash([]);
}
// Write a text label next to the tip of world-space vector v.
//   text       - label string
//   color      - fill colour for the label
//   offsetX/Y  - pixel offset from the vector tip (defaults 10, -10)
//   showCoords - when true, also prints "[x, y]" (two decimals) 14 px below
//                the label in a smaller monospace font
function drawLabel(v, text, color, offsetX = 10, offsetY = -10, showCoords = false) {
    const tip = toCanvas(v);
    const textX = tip.x + offsetX;
    const textY = tip.y + offsetY;

    ctx.fillStyle = color;
    ctx.font = 'bold 14px Segoe UI';
    ctx.fillText(text, textX, textY);

    if (!showCoords) {
        return;
    }

    // Coordinate readout underneath the label
    ctx.font = '11px Consolas, monospace';
    ctx.fillStyle = 'rgba(255,255,255,0.7)';
    const coordText = '[' + v.x.toFixed(2) + ', ' + v.y.toFixed(2) + ']';
    ctx.fillText(coordText, textX, textY + 14);
}
| function draw() { | |
| ctx.clearRect(0, 0, width, height); | |
| const k = getK(); | |
| const dw = getDW(); | |
| const dwProjected = projectToNullSpace(dw, k); | |
| // Calculate animation progress for step 3 (vector projection phase) | |
| const step3Start = 0.4; | |
| const vectorAnimProgress = currentStep >= 3 && isAnimating | |
| ? Math.min(1, Math.max(0, (animationProgress - step3Start) / (1 - step3Start))) | |
| : 0; | |
| // Interpolate for animation (only during step 3 vector projection phase) | |
| const dwAnimated = { | |
| x: dw.x + (dwProjected.x - dw.x) * vectorAnimProgress, | |
| y: dw.y + (dwProjected.y - dw.y) * vectorAnimProgress | |
| }; | |
| drawGrid(); | |
| // Draw subspace regions based on current step | |
| if (currentStep >= 2) { | |
| drawRowSpaceRegion(k); | |
| drawNullSpaceRegion(k); | |
| } | |
| // Draw projection line | |
| if (currentStep >= 3 && !isAnimating) { | |
| drawProjectionLine(dw, dwProjected, k); | |
| } | |
| // Draw vectors | |
| const origin = { x: 0, y: 0 }; | |
| // Preservation vector K (always show with coordinates) | |
| drawVector(origin, k, '#4caf50', 4); | |
| drawLabel(k, 'K', '#4caf50', 10, -10, true); | |
| if (currentStep >= 3 && isAnimating) { | |
| // During animation: show ΔW fading and morphing into ΔW' | |
| // Draw fading original ΔW (ghost) | |
| ctx.globalAlpha = 1 - vectorAnimProgress * 0.7; | |
| drawVector(origin, dw, '#f858fb', 3); | |
| drawLabel(dw, 'ΔW', '#f858fb', 10, 15, true); | |
| ctx.globalAlpha = 1; | |
| // Draw the animating vector (transitioning from ΔW to ΔW') | |
| drawVector(origin, dwAnimated, '#00d4ff', 3); | |
| drawLabel(dwAnimated, "→ ΔW'", '#00d4ff', 10, -10, true); | |
| } else { | |
| // Original update ΔW (always show with coordinates when not in step 3+ animation) | |
| drawVector(origin, dw, '#f858fb', 3); | |
| drawLabel(dw, 'ΔW', '#f858fb', 10, 15, true); | |
| // Projected update ΔW' (show from step 3 when not animating) | |
| if (currentStep >= 3) { | |
| drawVector(origin, dwProjected, '#00d4ff', 3); | |
| drawLabel(dwProjected, "ΔW'", '#00d4ff', 10, -10, true); | |
| } | |
| } | |
| // Update results | |
| updateResults(k, dw, dwProjected); | |
| } | |
| function updateResults(k, dw, dwProjected) { | |
| const dotBefore = dot(k, dw); | |
| const dotAfter = dot(k, dwProjected); | |
| const magRatio = magnitude(dwProjected) / magnitude(dw); | |
| document.getElementById('dot-before').textContent = dotBefore.toFixed(4); | |
| document.getElementById('dot-after').textContent = dotAfter.toFixed(6); | |
| document.getElementById('magnitude-ratio').textContent = (magRatio * 100).toFixed(1) + '%'; | |
| // Color code the after value | |
| const afterEl = document.getElementById('dot-after'); | |
| afterEl.style.color = Math.abs(dotAfter) < 0.0001 ? '#4caf50' : '#f858fb'; | |
| // Update the step-by-step math breakdown | |
| updateMathDisplay(k, dw, dwProjected); | |
| } | |
| function updateMathDisplay(k, dw, dwProjected) { | |
| // Vector coordinates bar (full-width) | |
| document.getElementById('bar-kx').textContent = k.x.toFixed(2); | |
| document.getElementById('bar-ky').textContent = k.y.toFixed(2); | |
| document.getElementById('bar-dwx').textContent = dw.x.toFixed(2); | |
| document.getElementById('bar-dwy').textContent = dw.y.toFixed(2); | |
| document.getElementById('bar-dwpx').textContent = dwProjected.x.toFixed(2); | |
| document.getElementById('bar-dwpy').textContent = dwProjected.y.toFixed(2); | |
| // Dot product breakdown | |
| document.getElementById('dot-kx').textContent = k.x.toFixed(2); | |
| document.getElementById('dot-dwx').textContent = dw.x.toFixed(2); | |
| document.getElementById('dot-ky').textContent = k.y.toFixed(2); | |
| document.getElementById('dot-dwy').textContent = dw.y.toFixed(2); | |
| const term1 = k.x * dw.x; | |
| const term2 = k.y * dw.y; | |
| const dotResult = term1 + term2; | |
| document.getElementById('dot-term1').textContent = term1.toFixed(3); | |
| document.getElementById('dot-term2').textContent = term2.toFixed(3); | |
| document.getElementById('dot-result').textContent = dotResult.toFixed(3); | |
| // Projection formula | |
| const kMagSq = k.x * k.x + k.y * k.y; | |
| const scale = kMagSq > 0 ? dotResult / kMagSq : 0; | |
| const subX = scale * k.x; | |
| const subY = scale * k.y; | |
| document.getElementById('k-mag-sq').textContent = kMagSq.toFixed(3); | |
| document.getElementById('proj-scale').textContent = scale.toFixed(3); | |
| document.getElementById('proj-dwx').textContent = dw.x.toFixed(2); | |
| document.getElementById('proj-dwy').textContent = dw.y.toFixed(2); | |
| document.getElementById('proj-subx').textContent = subX.toFixed(3); | |
| document.getElementById('proj-suby').textContent = subY.toFixed(3); | |
| document.getElementById('math-dwpx').textContent = dwProjected.x.toFixed(3); | |
| document.getElementById('math-dwpy').textContent = dwProjected.y.toFixed(3); | |
| // Verification | |
| document.getElementById('ver-kx').textContent = k.x.toFixed(2); | |
| document.getElementById('ver-dwpx').textContent = dwProjected.x.toFixed(3); | |
| document.getElementById('ver-ky').textContent = k.y.toFixed(2); | |
| document.getElementById('ver-dwpy').textContent = dwProjected.y.toFixed(3); | |
| const verTerm1 = k.x * dwProjected.x; | |
| const verTerm2 = k.y * dwProjected.y; | |
| const verResult = verTerm1 + verTerm2; | |
| document.getElementById('ver-term1').textContent = verTerm1.toFixed(4); | |
| document.getElementById('ver-term2').textContent = verTerm2.toFixed(4); | |
| const verEl = document.getElementById('ver-result'); | |
| if (Math.abs(verResult) < 0.0001) { | |
| verEl.textContent = `≈ 0 ✓`; | |
| verEl.className = 'verification-pass'; | |
| } else { | |
| verEl.textContent = verResult.toFixed(6); | |
| verEl.className = 'verification-fail'; | |
| } | |
| } | |
| function animateProjection() { | |
| if (isAnimating) return; | |
| isAnimating = true; | |
| animationProgress = 0; | |
| // Start from step 1 | |
| currentStep = 1; | |
| updateStepIndicators(); | |
| // Phase timings (in progress units where 1.0 = complete) | |
| const step2Start = 0.2; // Show subspaces at 20% | |
| const step3Start = 0.4; // Start vector projection at 40% | |
| function animate() { | |
| animationProgress += 0.015; | |
| // Progress through steps based on animation progress | |
| if (animationProgress >= step2Start && currentStep < 2) { | |
| currentStep = 2; | |
| updateStepIndicators(); | |
| } | |
| if (animationProgress >= step3Start && currentStep < 3) { | |
| currentStep = 3; | |
| updateStepIndicators(); | |
| } | |
| if (animationProgress >= 1) { | |
| animationProgress = 1; | |
| isAnimating = false; | |
| currentStep = 3; | |
| updateStepIndicators(); | |
| } | |
| draw(); | |
| if (isAnimating) { | |
| requestAnimationFrame(animate); | |
| } | |
| } | |
| animate(); | |
| } | |
| function updateStepIndicators() { | |
| document.querySelectorAll('.step').forEach((el, i) => { | |
| el.classList.remove('active', 'completed'); | |
| if (i + 1 < currentStep) el.classList.add('completed'); | |
| if (i + 1 === currentStep) el.classList.add('active'); | |
| }); | |
| // Update content | |
| document.querySelectorAll('.step-content').forEach(el => el.classList.remove('active')); | |
| const contentEl = document.getElementById(`step${currentStep}-content`); | |
| if (contentEl) contentEl.classList.add('active'); | |
| } | |
| function randomize() { | |
| document.getElementById('kx').value = Math.floor(Math.random() * 200 - 100); | |
| document.getElementById('ky').value = Math.floor(Math.random() * 200 - 100); | |
| document.getElementById('dwx').value = Math.floor(Math.random() * 200 - 100); | |
| document.getElementById('dwy').value = Math.floor(Math.random() * 200 - 100); | |
| updateSliderValues(); | |
| draw(); | |
| } | |
| function updateSliderValues() { | |
| document.getElementById('kx-val').textContent = (parseFloat(document.getElementById('kx').value) / 100).toFixed(2); | |
| document.getElementById('ky-val').textContent = (parseFloat(document.getElementById('ky').value) / 100).toFixed(2); | |
| document.getElementById('dwx-val').textContent = (parseFloat(document.getElementById('dwx').value) / 100).toFixed(2); | |
| document.getElementById('dwy-val').textContent = (parseFloat(document.getElementById('dwy').value) / 100).toFixed(2); | |
| } | |
| function setStep(step) { | |
| currentStep = step; | |
| // Update step indicators | |
| document.querySelectorAll('.step').forEach((el, i) => { | |
| el.classList.remove('active', 'completed'); | |
| if (i + 1 < step) el.classList.add('completed'); | |
| if (i + 1 === step) el.classList.add('active'); | |
| }); | |
| // Update content | |
| document.querySelectorAll('.step-content').forEach(el => el.classList.remove('active')); | |
| const contentEl = document.getElementById(`step${step}-content`); | |
| if (contentEl) contentEl.classList.add('active'); | |
| draw(); | |
| } | |
| function showTab(tabId) { | |
| document.querySelectorAll('.tab').forEach(el => el.classList.remove('active')); | |
| document.querySelectorAll('.tab-content').forEach(el => el.classList.remove('active')); | |
| event.target.classList.add('active'); | |
| document.getElementById(tabId).classList.add('active'); | |
| } | |
| // Event listeners | |
| document.querySelectorAll('input[type="range"]').forEach(slider => { | |
| slider.addEventListener('input', () => { | |
| updateSliderValues(); | |
| draw(); | |
| }); | |
| }); | |
| // Initial draw | |
| updateSliderValues(); | |
| draw(); | |
| </script> | |
| </body> | |
| </html> | |