nickypro
/

sonar-sae

Model card Files Files and versions

xet

Community

nickypro committed 13 days ago

Commit

8c7ee47

verified ·

1 Parent(s): 3efcfce

Upload sieve_bench/site/index.html with huggingface_hub

Browse files

Files changed (1) hide show

sieve_bench/site/index.html +440 -0

sieve_bench/site/index.html ADDED Viewed

	@@ -0,0 +1,440 @@

+<!DOCTYPE html>
+<html lang="en">
+<head>
+<meta charset="UTF-8">
+<meta name="viewport" content="width=device-width, initial-scale=1.0">
+<title>SIEVE — what a sentence embedding will and won't tell you</title>
+<meta name="description" content="SIEVE is a runnable benchmark that scores how interpretable a sentence-embedding space is — what you can read, decompose, build, and edit in a frozen vector — across 26 tasks and 9 encoders, with a confound audit that refuses tasks a bag-of-words already solves.">
+<link rel="preconnect" href="https://fonts.googleapis.com">
+<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+<link href="https://fonts.googleapis.com/css2?family=Fraunces:opsz,wght@9..144,400;9..144,500;9..144,600;9..144,800&family=Inter:wght@400;500;600&family=IBM+Plex+Mono:wght@400;500;600&display=swap" rel="stylesheet">
+<style>
+:root{
+  --bg:#f6f3ec; --bg2:#efeae0; --panel:#fffdf8; --ink:#23201b; --ink2:#5c554a; --ink3:#8c8475;
+  --line:#e1dacb; --line2:#d3cab6;
+  --pass:#1d7a6b;          /* passes the sieve / readable */
+  --pass-soft:#dcefe9;
+  --diag:#b4632a;          /* diagnostic / caught by the sieve */
+  --diag-soft:#f3e4d4;
+  --accent:#1d7a6b;
+  --shadow:0 1px 2px rgba(40,34,22,.05),0 8px 28px -12px rgba(40,34,22,.16);
+  --mono:'IBM Plex Mono',ui-monospace,Menlo,monospace;
+  --serif:'Fraunces',Georgia,serif;
+  --sans:'Inter',system-ui,-apple-system,sans-serif;
+}
+@media (prefers-color-scheme:dark){:root:not([data-theme="light"]){
+  --bg:#15161a; --bg2:#191b20; --panel:#1d1f25; --ink:#e9e4da; --ink2:#a9a294; --ink3:#7d7768;
+  --line:#2a2d34; --line2:#363a43;
+  --pass:#52c4ac; --pass-soft:#15302b; --diag:#e09a5e; --diag-soft:#34261a; --accent:#52c4ac;
+  --shadow:0 1px 2px rgba(0,0,0,.3),0 12px 34px -14px rgba(0,0,0,.6);
+}}
+:root[data-theme="dark"]{
+  --bg:#15161a; --bg2:#191b20; --panel:#1d1f25; --ink:#e9e4da; --ink2:#a9a294; --ink3:#7d7768;
+  --line:#2a2d34; --line2:#363a43;
+  --pass:#52c4ac; --pass-soft:#15302b; --diag:#e09a5e; --diag-soft:#34261a; --accent:#52c4ac;
+  --shadow:0 1px 2px rgba(0,0,0,.3),0 12px 34px -14px rgba(0,0,0,.6);
+}
+*{box-sizing:border-box}
+html{scroll-behavior:smooth}
+@media (prefers-reduced-motion:reduce){html{scroll-behavior:auto}*{transition:none!important;animation:none!important}}
+body{margin:0;background:var(--bg);color:var(--ink);font-family:var(--sans);font-size:17px;line-height:1.65;-webkit-font-smoothing:antialiased;text-rendering:optimizeLegibility}
+.wrap{max-width:1080px;margin:0 auto;padding:0 24px}
+a{color:var(--accent);text-decoration:none}
+a:hover{text-decoration:underline;text-underline-offset:3px}
+h1,h2,h3{font-family:var(--serif);font-weight:600;letter-spacing:-.01em;line-height:1.12;color:var(--ink)}
+code,.mono{font-family:var(--mono)}
+.prose{max-width:68ch}
+/* nav */
+nav{position:sticky;top:0;z-index:50;background:color-mix(in srgb,var(--bg) 86%,transparent);backdrop-filter:blur(10px);-webkit-backdrop-filter:blur(10px);border-bottom:1px solid var(--line)}
+nav .wrap{display:flex;align-items:center;gap:22px;height:56px}
+.brand{font-family:var(--serif);font-weight:600;font-size:19px;letter-spacing:.02em;margin-right:auto;display:flex;align-items:center;gap:10px}
+.brand .mesh{width:18px;height:18px;flex:none}
+nav a.nl{color:var(--ink2);font-size:14px;font-weight:500}
+nav a.nl:hover{color:var(--ink);text-decoration:none}
+.tg{background:none;border:1px solid var(--line2);color:var(--ink2);border-radius:999px;padding:5px 12px;font-size:12px;font-family:var(--mono);cursor:pointer;transition:.25s cubic-bezier(.16,1,.3,1)}
+.tg:hover{color:var(--ink);border-color:var(--ink3)}
+.tg:focus-visible{outline:2px solid var(--accent);outline-offset:2px}
+@media(max-width:680px){nav a.nl{display:none}}
+/* hero */
+header{padding:72px 0 40px;border-bottom:1px solid var(--line)}
+.kic{font-family:var(--mono);font-size:12.5px;letter-spacing:.16em;text-transform:uppercase;color:var(--pass);font-weight:500;display:flex;align-items:center;gap:9px;margin-bottom:22px}
+.kic::before{content:"";width:26px;height:1px;background:var(--pass)}
+h1.title{font-size:clamp(2.6rem,6vw,4.4rem);font-weight:800;margin:0 0 4px;line-height:1}
+h1.title .expand{display:block;font-size:clamp(1rem,2.2vw,1.35rem);font-weight:500;color:var(--ink2);font-family:var(--sans);letter-spacing:0;margin-top:18px;max-width:30ch}
+.lede{font-size:clamp(1.12rem,2vw,1.32rem);line-height:1.5;color:var(--ink);max-width:46ch;margin:30px 0 0;font-weight:400}
+.lede b{font-weight:600;color:var(--ink)}
+.herofacts{display:flex;flex-wrap:wrap;gap:0;margin-top:40px;border:1px solid var(--line);border-radius:14px;overflow:hidden;background:var(--panel);box-shadow:var(--shadow)}
+.hf{flex:1;min-width:128px;padding:18px 20px;border-right:1px solid var(--line)}
+.hf:last-child{border-right:none}
+.hf .n{font-family:var(--serif);font-weight:600;font-size:1.9rem;line-height:1;color:var(--ink)}
+.hf .l{font-size:12.5px;color:var(--ink2);margin-top:7px;line-height:1.35}
+@media(max-width:560px){.hf{flex:1 1 40%;border-bottom:1px solid var(--line)}}
+section{padding:62px 0;border-bottom:1px solid var(--line)}
+.eyebrow{font-family:var(--mono);font-size:12px;letter-spacing:.15em;text-transform:uppercase;color:var(--ink3);font-weight:500;margin:0 0 14px}
+h2{font-size:clamp(1.7rem,3.2vw,2.3rem);margin:0 0 8px}
+.sub{color:var(--ink2);font-size:1.05rem;max-width:62ch;margin:0 0 30px}
+p{margin:0 0 18px}
+.prose p{color:var(--ink);}
+.prose p.muted{color:var(--ink2)}
+/* the core idea — two cards */
+.split{display:grid;grid-template-columns:1fr 1fr;gap:18px}
+@media(max-width:720px){.split{grid-template-columns:1fr}}
+.idea{background:var(--panel);border:1px solid var(--line);border-radius:14px;padding:26px;box-shadow:var(--shadow)}
+.idea .q{font-family:var(--serif);font-size:1.25rem;color:var(--ink);font-weight:600;margin-bottom:10px}
+.idea.usual{opacity:.92}
+.idea .tag{font-family:var(--mono);font-size:11px;letter-spacing:.1em;text-transform:uppercase;color:var(--ink3);margin-bottom:14px;display:block}
+.idea.sieve{border-color:var(--pass);box-shadow:0 0 0 1px var(--pass),var(--shadow)}
+.idea.sieve .tag{color:var(--pass)}
+/* families */
+.fam{border:1px solid var(--line);border-radius:13px;overflow:hidden;margin-bottom:14px;background:var(--panel)}
+.fam>summary{list-style:none;cursor:pointer;padding:18px 22px;display:flex;align-items:center;gap:16px}
+.fam>summary::-webkit-details-marker{display:none}
+.fam .letter{font-family:var(--serif);font-weight:800;font-size:1.5rem;width:38px;height:38px;flex:none;display:grid;place-items:center;border-radius:9px;background:var(--bg2);color:var(--pass)}
+.fam .fname{font-family:var(--serif);font-weight:600;font-size:1.18rem}
+.fam .fdesc{color:var(--ink2);font-size:.92rem;margin-top:2px}
+.fam .chev{margin-left:auto;color:var(--ink3);transition:transform .3s cubic-bezier(.16,1,.3,1);font-family:var(--mono)}
+.fam[open] .chev{transform:rotate(90deg)}
+.fam .tasks{padding:2px 22px 18px}
+.trow{display:flex;gap:12px;padding:9px 0;border-top:1px solid var(--line);align-items:baseline}
+.trow .tid{font-family:var(--mono);font-size:12px;color:var(--ink3);width:38px;flex:none}
+.trow .tnm{font-weight:500;color:var(--ink);min-width:200px}
+.trow .ttx{color:var(--ink2);font-size:.92rem}
+.pill{font-family:var(--mono);font-size:9.5px;letter-spacing:.06em;text-transform:uppercase;padding:2px 7px;border-radius:999px;font-weight:600;white-space:nowrap;align-self:center}
+.pill.diag{background:var(--diag-soft);color:var(--diag)}
+.pill.gen{background:var(--bg2);color:var(--ink3)}
+@media(max-width:600px){.trow{flex-wrap:wrap}.trow .tnm{min-width:0}}
+/* audit rules */
+.rules{display:grid;grid-template-columns:repeat(3,1fr);gap:16px;margin-top:8px}
+@media(max-width:720px){.rules{grid-template-columns:1fr}}
+.rule{background:var(--panel);border:1px solid var(--line);border-radius:13px;padding:22px;box-shadow:var(--shadow)}
+.rule .rn{font-family:var(--mono);font-size:12px;color:var(--diag);font-weight:600;letter-spacing:.08em;margin-bottom:10px}
+.rule h3{font-size:1.08rem;margin:0 0 8px}
+.rule p{font-size:.93rem;color:var(--ink2);margin:0}
+.rule code{font-size:.82em;background:var(--bg2);padding:1px 5px;border-radius:4px;color:var(--ink)}
+/* leaderboard / heatmap */
+.lbwrap{background:var(--panel);border:1px solid var(--line);border-radius:14px;padding:8px;box-shadow:var(--shadow);overflow-x:auto}
+table.heat{border-collapse:collapse;width:100%;font-size:13.5px;min-width:560px}
+table.heat th,table.heat td{padding:8px 6px;text-align:center}
+table.heat thead th{font-family:var(--mono);font-size:11px;letter-spacing:.04em;color:var(--ink2);font-weight:500;border-bottom:1px solid var(--line2);position:sticky;left:auto}
+table.heat th.enc{text-align:left;font-family:var(--sans);font-weight:600;font-size:13.5px;color:var(--ink);white-space:nowrap;padding-left:14px}
+table.heat td.delta{font-family:var(--mono);font-weight:600;font-size:14px;color:var(--ink);border-left:1px solid var(--line2)}
+table.heat .cell{font-family:var(--mono);font-size:12.5px;color:#0c1410;border-radius:5px;cursor:default;font-weight:500}
+/* Heat-map cell text in dark mode. An at-rule cannot be an item of a selector list (the combined form is an invalid selector, so the whole rule is discarded), so the forced-dark and OS-dark cases are two separate rules; the OS-dark rule yields to an explicit light override. */
+:root[data-theme="dark"] table.heat .cell{color:#eafffa}
+@media (prefers-color-scheme:dark){:root:not([data-theme="light"]) table.heat .cell{color:#eafffa}}
+table.heat tbody tr{border-bottom:1px solid var(--line)}
+table.heat tbody tr:last-child{border-bottom:none}
+table.heat tbody tr.floor th.enc{color:var(--diag)}
+.ranknum{font-family:var(--mono);color:var(--ink3);font-size:11px;margin-right:8px}
+.lbnote{font-size:12.5px;color:var(--ink2);margin-top:14px;font-family:var(--mono);line-height:1.6}
+.lbnote b{color:var(--ink)}
+.dbar{height:7px;border-radius:4px;background:var(--pass);display:inline-block;vertical-align:middle;margin-left:8px;opacity:.5}
+/* findings */
+.find{display:grid;grid-template-columns:1fr;gap:0}
+.f{padding:24px 0;border-top:1px solid var(--line);display:grid;grid-template-columns:64px 1fr;gap:20px}
+.f:first-child{border-top:none}
+.f .fn{font-family:var(--serif);font-size:2.1rem;font-weight:800;color:var(--line2);line-height:1}
+.f h3{font-size:1.2rem;margin:0 0 7px}
+.f p{margin:0;color:var(--ink2);font-size:.97rem}
+.f p b{color:var(--ink);font-weight:600}
+.f .num{font-family:var(--mono);color:var(--pass);font-weight:600}
+@media(max-width:560px){.f{grid-template-columns:1fr;gap:6px}.f .fn{font-size:1.4rem}}
+/* quickstart */
+.code{background:#15171c;border:1px solid #262a31;border-radius:12px;padding:20px 22px;font-family:var(--mono);font-size:13.5px;color:#cfd6dd;overflow-x:auto;line-height:1.85;position:relative}
+.code .c{color:#6b7785}
+.code .g{color:#7fd1bd}
+.code .y{color:#d8b673}
+.copy{position:absolute;top:12px;right:12px;background:#262a31;border:none;color:#9aa3ad;font-family:var(--mono);font-size:11px;padding:5px 10px;border-radius:6px;cursor:pointer;transition:.2s}
+.copy:hover{background:#323843;color:#dfe4ea}
+.steps{counter-reset:s;margin:0 0 26px;padding:0;list-style:none}
+.steps li{counter-increment:s;padding:4px 0 4px 38px;position:relative;color:var(--ink2)}
+.steps li::before{content:counter(s);position:absolute;left:0;top:2px;width:24px;height:24px;border-radius:50%;background:var(--bg2);border:1px solid var(--line2);color:var(--ink);font-family:var(--mono);font-size:12px;display:grid;place-items:center}
+.steps li b{color:var(--ink)}
+.callout{background:var(--pass-soft);border:1px solid color-mix(in srgb,var(--pass) 35%,transparent);border-radius:12px;padding:18px 22px;font-size:.96rem;color:var(--ink)}
+.callout b{color:var(--pass)}
+/* footer */
+footer{padding:54px 0 80px;color:var(--ink2);font-size:14px}
+footer .grid{display:flex;flex-wrap:wrap;gap:40px;margin-bottom:34px}
+footer h4{font-family:var(--mono);font-size:11px;letter-spacing:.12em;text-transform:uppercase;color:var(--ink3);margin:0 0 12px;font-weight:500}
+footer a{display:block;color:var(--ink2);margin-bottom:7px}
+footer a:hover{color:var(--accent)}
+.disc{border-top:1px solid var(--line);padding-top:24px;font-size:13px;color:var(--ink3);max-width:74ch;line-height:1.65}
+.btn{display:inline-flex;align-items:center;gap:8px;background:var(--ink);color:var(--bg);padding:11px 20px;border-radius:999px;font-weight:600;font-size:14px;font-family:var(--sans);transition:.25s cubic-bezier(.16,1,.3,1)}
+.btn:hover{text-decoration:none;transform:translateY(-1px);box-shadow:var(--shadow)}
+.btn.alt{background:transparent;color:var(--ink);border:1px solid var(--line2)}
+.btnrow{display:flex;gap:12px;flex-wrap:wrap;margin-top:30px}
+.mesh path,.mesh line{stroke:var(--pass);stroke-width:1.2;fill:none}
+</style>
+</head>
+<body>
+<nav><div class="wrap">
+  <span class="brand">
+    <svg class="mesh" viewBox="0 0 20 20" aria-hidden="true"><path d="M2 5h16M2 10h16M2 15h16M5 2v16M10 2v16M15 2v16"/></svg>
+    SIEVE
+  </span>
+  <a class="nl" href="#idea">Idea</a>
+  <a class="nl" href="#measures">Tasks</a>
+  <a class="nl" href="#audit">Audit</a>
+  <a class="nl" href="#board">Leaderboard</a>
+  <a class="nl" href="#findings">Findings</a>
+  <a class="nl" href="#run">Run it</a>
+  <button class="tg" id="tg" aria-label="Toggle colour theme">auto</button>
+</div></nav>
+<header><div class="wrap">
+  <div class="kic">Sentence-embedding Interpretability EValuation</div>
+  <h1 class="title">SIEVE
+    <span class="expand">A benchmark for what a sentence embedding will, and won't, tell you.</span>
+  </h1>
+  <p class="lede">Most benchmarks ask whether a sentence embedding is <i>good</i> — does it retrieve, cluster, rank. SIEVE asks whether it is <b>legible</b>: given one frozen 1024-d vector, what can you <b>read</b> out of it, <b>decompose</b>, <b>build</b>, or <b>edit</b> — and what is simply <b>not there</b>. It runs on any encoder, on CPU, and it is built to refuse to fool you.</p>
+  <div class="herofacts">
+    <div class="hf"><div class="n">26</div><div class="l">tasks across 5 capability families</div></div>
+    <div class="hf"><div class="n">9</div><div class="l">encoders profiled (SONAR → GloVe-bag)</div></div>
+    <div class="hf"><div class="n">5</div><div class="l">live discriminating axes, audit-gated</div></div>
+    <div class="hf"><div class="n">CPU</div><div class="l">runnable on your own HF encoder</div></div>
+  </div>
+</div></header>
+<section id="idea"><div class="wrap">
+  <p class="eyebrow">The premise</p>
+  <h2>Legible is not the same as good</h2>
+  <p class="sub">A high score on a retrieval benchmark tells you a vector is <i>useful</i>. It says nothing about whether a human — or a safety monitor — can tell what is inside it. Those are different questions, and almost nothing measures the second one.</p>
+  <div class="split">
+    <div class="idea usual">
+      <span class="tag">What benchmarks usually ask</span>
+      <div class="q">"Is this embedding good?"</div>
+      <p class="muted" style="margin:0;color:var(--ink2);font-size:.96rem">Retrieval accuracy, clustering quality, STS correlation. Optimised for downstream utility. A perfectly opaque vector can top every one of them.</p>
+    </div>
+    <div class="idea sieve">
+      <span class="tag">What SIEVE asks</span>
+      <div class="q">"Is this embedding legible?"</div>
+      <p style="margin:0;color:var(--ink2);font-size:.96rem">Can you read the words, the numbers, the order? Decompose its structure? Build one from parts? Edit one sentence of three and leave the rest? And crucially — <b style="color:var(--pass)">where does that legibility run out?</b></p>
+    </div>
+  </div>
+  <p class="prose" style="margin-top:30px;color:var(--ink2)">The name is the method. A sieve sorts what passes through from what is caught. SIEVE separates what a sentence embedding genuinely carries in a readable form from what only <i>looks</i> readable because a bag-of-words baseline already solved it. The second half is the harder, more honest measurement — and it is where the interesting science lives.</p>
+</div></section>
+<section id="measures"><div class="wrap">
+  <p class="eyebrow">What it measures</p>
+  <h2>Five families, twenty-six tasks</h2>
+  <p class="sub">Every task is normalised to 0–1 and reported against a <b>baseline</b> (random / bag-of-words / surface-position) and a <b>ceiling</b> (oracle / full-vector). Tasks marked <span class="pill diag" style="display:inline-block">diag</span> are diagnostic — a <i>low</i> score is the finding, not a failure. <span class="pill gen" style="display:inline-block">gen</span> tasks need a decoder.</p>
+  <div id="families"></div>
+</div></section>
+<section id="audit"><div class="wrap">
+  <p class="eyebrow">Why you can trust it</p>
+  <h2>The audit: a benchmark that refuses to fool itself</h2>
+  <p class="sub">The hardest failure in interpretability is a high number that means nothing — a probe that "reads" a concept that a bag-of-words could read just as well. SIEVE bakes a confound check into the harness. Every task is <b>audited</b>, and the headline averages <b>only the tasks that pass</b>.</p>
+  <div class="rules">
+    <div class="rule">
+      <div class="rn">RULE 1 · SPECIFICITY</div>
+      <h3>Not z-specific → excluded</h3>
+      <p>If a <code>bag-of-words</code> or <code>surface-position</code> baseline already reaches the score, the task isn't measuring the <i>embedding</i> — a dumb null solved it. It cannot count toward "this vector is legible."</p>
+    </div>
+    <div class="rule">
+      <div class="rn">RULE 2 · HEADROOM</div>
+      <h3>Degenerate → excluded</h3>
+      <p>If the score barely clears its own baseline (<code>Δ&lt;0.05</code>), the task is saturated or trivial. A pass with no headroom carries no ranking signal and is dropped from the comparison.</p>
+    </div>
+    <div class="rule">
+      <div class="rn">RULE 3 · AGREEMENT</div>
+      <h3>Disagreeing arms → excluded</h3>
+      <p>Tasks measured two ways (a probe arm and a decode arm) must agree to <code>±0.15</code>. A gap means one arm is buggy — so the number is suppressed until it's reconciled.</p>
+    </div>
+  </div>
+  <p class="prose" style="margin-top:28px;color:var(--ink2)">This is not theoretical. During development the audit caught the benchmark's own most-quotable headline — a "universal no-binding" result that was really a <i>broken task</i> scoring chance for everyone — and forced it to be rebuilt on natural data before it could ship. The two-track leaderboard below reports an apples-to-apples <b>intersection</b> of audit-passing tasks, never one inflated winner column.</p>
+</div></section>
+<section id="board"><div class="wrap">
+  <p class="eyebrow">The leaderboard</p>
+  <h2>Nine encoders, five live axes</h2>
+  <p class="sub">The fair comparative number — <b>Encode-Readout-Δ</b> — is the mean of <code>(score − baseline) / (ceiling − baseline)</code> over the five tasks that <i>every</i> encoder runs and audit-passes on the same probe-only arm. Hover a cell for the per-axis score. The static <b style="color:var(--diag)">GloVe-bag</b> is a literal bag-of-words — the floor, and a control that the additivity axis correctly flags.</p>
+  <div class="lbwrap">
+    <table class="heat" id="board-table"></table>
+  </div>
+  <p class="lbnote" id="lbnote"></p>
+</div></section>
+<section id="findings"><div class="wrap">
+  <p class="eyebrow">What it found</p>
+  <h2>Five honest findings</h2>
+  <p class="sub">The benchmark's first scientific use — a designed sweep of size, training objective, and architecture — produced results that were pre-registered before looking, and one claim it had to walk back through its own controls.</p>
+  <div class="find">
+    <div class="f"><div class="fn">01</div><div>
+      <h3>Interpretability is not one number</h3>
+      <p>Across the nine encoders the discriminating axes rank them <b>almost independently</b> — mean cross-axis rank-correlation <span class="num">+0.21</span>. Lexical-readout and feature-monosemanticity are even <b>anti-correlated</b> (<span class="num">−0.67</span>): the encoders best at reading words out are the <i>worst</i> at having monosemantic features. There is no single "interpretability score" — which is why SIEVE reports a profile, not a winner.</p>
+    </div></div>
+    <div class="f"><div class="fn">02</div><div>
+      <h3>No abstract role-binding — in any encoder</h3>
+      <p>Trained to read "who is the agent" on one sentence construction and tested on another, <b>every</b> encoder fails (cross-construction AUC <span class="num">0.47–0.60</span>, none above 0.70) while a surface-position baseline reads it well (<span class="num">0.91</span>). Mean-pooled sentence embeddings carry <i>who</i> and <i>what</i> as content, but not <i>who-did-what-to-whom</i> as structure. Verified on 1,270 naturally-parsed sentences across five syntactic forms.</p>
+    </div></div>
+    <div class="f"><div class="fn">03</div><div>
+      <h3>Training objective reshapes <i>additivity</i>, not readout</h3>
+      <p>A masked-LM BERT and a contrastive retriever reach the <b>same</b> abstract-readout score — but contrastive training makes the pooled space far more of a literal bag-of-words. Objective controls how additive the geometry is, not how much abstract content is legible.</p>
+    </div></div>
+    <div class="f"><div class="fn">04</div><div>
+      <h3>Interpretability does not scale with size</h3>
+      <p>Across e5-small → base → large the readout score is <b>flat within seed noise</b> (<span class="num">σ≈0.016</span>, span 0.03, non-monotone). Bigger is not more legible — at least not within a family.</p>
+    </div></div>
+    <div class="f"><div class="fn">05</div><div>
+      <h3>The control behaves exactly as it should</h3>
+      <p>A literal static bag-of-GloVe-vectors is the <i>only</i> encoder flagged genuinely additive — order-sensitivity exactly <span class="num">0.000</span>, vector cosine to the word-mean <span class="num">0.985</span> — and sits at the readout floor. When your benchmark's planted negative control lands precisely where it must, the axis is calibrated. <span style="color:var(--ink3)">(And the over-claim it caught: an early "contrastive embeddings are just bags" headline was walked back once a shuffled-word + order-permutation control showed it was a length artifact.)</span></p>
+    </div></div>
+  </div>
+</div></section>
+<section id="run"><div class="wrap">
+  <p class="eyebrow">Run it</p>
+  <h2>Score your own encoder in three lines</h2>
+  <ol class="steps">
+    <li>Download <code>sieve_bench/</code> from the Hugging Face repo — no SONAR, no GPU required.</li>
+    <li><b>Point it at any model</b> via the <code>hf:&lt;id&gt;:&lt;pool&gt;</code> adapter, or implement the three-method <code>Encoder</code> interface for your own.</li>
+    <li>Read the auto-generated two-track leaderboard with its audit column.</li>
+  </ol>
+  <!-- white-space:pre keeps each command on its own line; a plain div collapses the newlines into one run-on line, both on screen and in the text the copy button reads. The button and the closing tag hug the text so no blank lines are rendered. -->
+  <div class="code" style="white-space:pre"><button class="copy" onclick="cp(this)">copy</button><span class="c"># a stranger with any HuggingFace sentence encoder, on CPU</span>
+pip install -r requirements.txt
+python run.py --encoder <span class="y">hf:sentence-transformers/all-MiniLM-L6-v2:mean</span> --tasks all
+python make_leaderboard.py   <span class="c"># → LEADERBOARD.md</span></div>
+  <div class="callout" style="margin-top:24px"><b>Decode-free by default.</b> The core readout, decomposition, and cross-lingual families run on any encoder that produces a vector. Construction and editing tasks need a decoder and skip cleanly when one isn't present — so the comparison stays fair across encoders that can't generate.</div>
+</div></section>
+<footer><div class="wrap">
+  <div class="grid">
+    <div>
+      <h4>Artifact</h4>
+      <a href="https://huggingface.co/nickypro/sonar-sae/tree/main/sieve_bench">Hugging Face · sieve_bench</a>
+      <a href="https://huggingface.co/nickypro/sonar-sae/blob/main/sieve_bench/README.md">README &amp; quickstart</a>
+      <a href="https://huggingface.co/nickypro/sonar-sae/blob/main/sieve_bench/SIEVE_SPEC.md">Full design spec</a>
+      <a href="https://huggingface.co/nickypro/sonar-sae/blob/main/sieve_bench/LEADERBOARD.md">Raw leaderboard</a>
+    </div>
+    <div>
+      <h4>Design invariants</h4>
+      <a style="cursor:default">Baseline + ceiling + control, every task</a>
+      <a style="cursor:default">Audit-pass tasks only in the headline</a>
+      <a style="cursor:default">Diagnostic tasks: low is the finding</a>
+      <a style="cursor:default">Per-axis profile, never one winner</a>
+    </div>
+    <div>
+      <h4>The five families</h4>
+      <a style="cursor:default">A · Readout</a>
+      <a style="cursor:default">B · Decomposition</a>
+      <a style="cursor:default">C · Construction</a>
+      <a style="cursor:default">D · Editing</a>
+      <a style="cursor:default">E · Cross-lingual</a>
+    </div>
+  </div>
+  <div class="btnrow" style="margin-bottom:34px">
+    <a class="btn" href="https://huggingface.co/nickypro/sonar-sae/tree/main/sieve_bench">Get SIEVE →</a>
+    <a class="btn alt" href="#measures">See the tasks</a>
+  </div>
+  <p class="disc">This page and the SIEVE benchmark were generated by an AI research agent. Findings were produced by running code and reading the results, with pre-registered predictions and an automated confound audit; they have primarily been machine-verified and self-critiqued, not independently reproduced by a human. Numbers are read from the actual benchmark runs. "Legibility" here means probe / decode / reconstruction readout of a frozen embedding — not a claim about human perception. Reviewed adversarially by a second model during development; the benchmark walked back one over-claim and fixed one broken headline task through its own controls. Treat the comparative leaderboard as a research instrument, not a product ranking: several gaps sit within seed noise, as noted on the board.</p>
+</div></footer>
+<div style="position:fixed;top:10px;right:10px;background:rgba(0,0,0,0.45);color:rgba(255,255,255,0.7);font-size:9px;padding:2px 7px;border-radius:8px;z-index:9999;font-family:system-ui,-apple-system,sans-serif;pointer-events:none;backdrop-filter:blur(6px);-webkit-backdrop-filter:blur(6px);letter-spacing:0.3px;text-transform:uppercase;font-weight:500;">ai gen</div>
+<script>
+/* ---- theme toggle ---- */
+const modes=['auto','light','dark'];
+let mi=modes.indexOf(localStorage.getItem('theme')||'auto'); if(mi<0)mi=0;
+const tgEl=document.getElementById('tg');
+function applyTheme(){const m=modes[mi];if(m==='auto')document.documentElement.removeAttribute('data-theme');else document.documentElement.setAttribute('data-theme',m);tgEl.textContent=m;}
+tgEl.addEventListener('click',()=>{mi=(mi+1)%3;localStorage.setItem('theme',modes[mi]);applyTheme();});
+applyTheme();
+/* ---- copy ----
+   cp(b): copy the text of the box that contains button b to the clipboard and
+   flash the outcome on the button for 1.4 s before restoring "copy".
+   The button's own label is stripped from the front of the text; all three
+   labels are matched, so a second click while it still reads "copied" does not
+   leak "ied" into the clipboard. "copied" is shown only once the write has
+   succeeded; a missing Clipboard API (insecure context) or a rejected write
+   shows "failed" instead of throwing or leaving an unhandled rejection. */
+function cp(b){const code=b.parentElement.innerText.replace(/^\s*(?:copied|copy|failed)/,'').trim();const flash=(msg)=>{b.textContent=msg;setTimeout(()=>{b.textContent='copy';},1400);};if(!navigator.clipboard){flash('failed');return;}navigator.clipboard.writeText(code).then(()=>flash('copied'),()=>flash('failed'));}
+/* ---- families data ----
+   One entry per capability family: L = letter, n = name, d = short description,
+   t = task list. Each task is [id, name, description, tag]; tag is 'diag'
+   (rendered as the diagnostic pill), 'gen' (rendered as the gen pill) or ''
+   (no pill). */
+const FAM=[
+ {L:'A',n:'Readout',d:'read content out of a frozen vector',t:[
+   ['t01','lexical bag','which content words are present',''],
+   ['t02','number-exact','recover the exact numeric value',''],
+   ['t03','entity-presence','is entity X present, tested across paraphrase',''],
+   ['t04','negation-scope','negation present, and which clause it scopes',''],
+   ['t05','position / order','recover word order from the vector',''],
+   ['t06','thematic-role','is X the agent? — across constructions','diag'],
+   ['t07','meaning-coverage','rebuild meaning from readable properties','gen'],
+   ['t08','length-generalization','does a short-trained readout survive on long inputs',''],
+   ['t09','coreference','does a pronoun refer to entity X','diag']]},
+ {L:'B',n:'Decomposition',d:'how the vector is organised',t:[
+   ['t10','dimensionality','effective rank, intrinsic dimension of the manifold','diag'],
+   ['t11','position un-rotation','can a token be separated from its position','diag'],
+   ['t12','additivity','is the vector just a bag of word-vectors','diag'],
+   ['t13','SAE-monosemanticity','do sparse features carry single clean concepts',''],
+   ['t14','capacity-law','how recoverability decays with sentence length','']]},
+ {L:'C',n:'Construction',d:'build a vector from parts',t:[
+   ['t15','sentence-from-words','construct a faithful vector from shuffled words','gen'],
+   ['t16','vocab-coverage','single-word round-trip across a vocabulary','gen'],
+   ['t17','recombination','is uniform pooling already near-optimal','diag']]},
+ {L:'D',n:'Editing',d:'change a vector precisely',t:[
+   ['t18','concept-steer','add a concept direction — does it causally appear','diag'],
+   ['t22','word-edit','replace X with Y, preserving the rest','gen'],
+   ['t23','edit sentence 2 of 3','edit one clause, leave the others intact','gen'],
+   ['t24','sentence-reorder','swap the order of encoded sentences','diag'],
+   ['t25','concept-injection','inject content at the predicted capacity budget','gen'],
+   ['t26','causal-identifiability','swap agent⇄patient while preserving content','diag']]},
+ {L:'E',n:'Cross-lingual',d:'across languages and encoders',t:[
+   ['t19','cross-lingual readout','does a content probe transfer across languages',''],
+   ['t20','decode-by-language','readout quality per language','gen'],
+   ['t21','encoder generality','does the profile hold across pooling / encoders','']]}
+];
+const famHost=document.getElementById('families');
+FAM.forEach(f=>{
+  const tasks=f.t.map(t=>`<div class="trow"><span class="tid">${t[0]}</span><span class="tnm">${t[1]}</span><span class="ttx">${t[2]}</span>${t[3]==='diag'?'<span class="pill diag">diag</span>':t[3]==='gen'?'<span class="pill gen">gen</span>':''}</div>`).join('');
+  famHost.insertAdjacentHTML('beforeend',`<details class="fam"${f.L==='A'?' open':''}><summary><span class="letter">${f.L}</span><span><span class="fname">${f.n}</span><div class="fdesc">${f.d} · ${f.t.length} tasks</div></span><span class="chev">›</span></summary><div class="tasks">${tasks}</div></details>`);
+});
+/* ---- leaderboard heatmap ----
+   AX   : task ids shown as the heat-map columns, in display order.
+   AXL  : column header / tooltip label for each of those ids.
+   ROWS : one [encoder name, Δ readout, {task id: score}, floor flag] per encoder;
+          the floor flag puts the "floor" class on the row.
+          buildBoard() numbers rows by array position and scales the Δ bar by
+          ROWS[0][1], so the list is expected to stay sorted by Δ, highest first. */
+const AX=['t01','t05','t08','t13','t19'];
+const AXL={t01:'lexical',t05:'order',t08:'length-gen',t13:'SAE-mono',t19:'cross-ling'};
+const ROWS=[
+ ['SONAR',0.762,{t01:0.41,t05:0.84,t08:0.73,t13:0.83,t19:1.00},false],
+ ['gte-large',0.719,{t01:0.31,t05:0.63,t08:0.70,t13:0.96,t19:1.00},false],
+ ['mpnet',0.704,{t01:0.26,t05:0.66,t08:0.71,t13:0.96,t19:0.93},false],
+ ['e5-base',0.690,{t01:0.38,t05:0.70,t08:0.56,t13:0.85,t19:0.96},false],
+ ['LaBSE',0.671,{t01:0.50,t05:0.67,t08:0.53,t13:0.66,t19:1.00},false],
+ ['e5-large',0.644,{t01:0.40,t05:0.53,t08:0.49,t13:0.80,t19:0.99},false],
+ ['e5-small',0.644,{t01:0.40,t05:0.52,t08:0.51,t13:0.80,t19:0.98},false],
+ ['bert-base',0.565,{t01:0.36,t05:0.80,t08:0.14,t13:0.91,t19:0.61},false],
+ ['GloVe-bag',0.346,{t01:0.33,t05:0.09,t08:0.06,t13:0.74,t19:0.51},true]
+];
+function lerp(a,b,t){return a+(b-a)*t;}
+function cellColor(v){ // v 0..1 → pale → pass colour
+  const t=Math.max(0,Math.min(1,v));
+  const dark=document.documentElement.getAttribute('data-theme')==='dark'|| (matchMedia('(prefers-color-scheme:dark)').matches && document.documentElement.getAttribute('data-theme')!=='light');
+  if(dark){return `rgba(82,196,172,${0.08+0.62*t})`;}
+  return `rgba(29,122,107,${0.06+0.66*t})`;
+}
+function buildBoard(){
+  const maxD=ROWS[0][1];
+  let h=`<thead><tr><th class="enc">Encoder</th>`+AX.map(a=>`<th>${AXL[a]}</th>`).join('')+`<th>Δ&nbsp;readout</th></tr></thead><tbody>`;
+  ROWS.forEach((r,i)=>{
+    const [name,delta,cells,floor]=r;
+    h+=`<tr class="${floor?'floor':''}"><th class="enc"><span class="ranknum">${String(i+1).padStart(2,'0')}</span>${name}</th>`;
+    AX.forEach(a=>{const v=cells[a];h+=`<td><span class="cell" title="${name} · ${AXL[a]} = ${v.toFixed(2)}" style="display:inline-block;min-width:42px;padding:4px 0;background:${cellColor(v)}">${v.toFixed(2)}</span></td>`;});
+    h+=`<td class="delta">${delta.toFixed(3)}<span class="dbar" style="width:${28*delta/maxD}px"></span></td></tr>`;
+  });
+  h+=`</tbody>`;
+  document.getElementById('board-table').innerHTML=h;
+}
+/* Initial render of the table, then the static reading note shown under it. */
+buildBoard();
+/* NOTE(review): the note says second place is 0.046 behind, but the ROWS values give 0.762 − 0.719 = 0.043 — confirm which figure is intended. */
+document.getElementById('lbnote').innerHTML='<b>How to read it.</b> Δ-readout leads with SONAR (0.762) but the gap to the field is small — second place is 0.046 behind, <b>inside the spread and within seed noise</b>. The interesting signal is the columns disagreeing: no encoder wins everywhere. mpnet &amp; gte top SAE-monosemanticity yet sit near the bottom on lexical readout; LaBSE leads lexical; SONAR leads order. That disagreement <b>is</b> finding&nbsp;01.';
+/* recolour on theme change: cell fills are computed in JS, so the table is rebuilt when <html data-theme> changes or the OS colour-scheme preference flips */
+new MutationObserver(buildBoard).observe(document.documentElement,{attributes:true,attributeFilter:['data-theme']});
+matchMedia('(prefers-color-scheme:dark)').addEventListener('change',buildBoard);
+</script>
+</body>
+</html>