nickypro committed on
Commit
8c7ee47
·
verified ·
1 Parent(s): 3efcfce

Upload sieve_bench/site/index.html with huggingface_hub

Browse files
Files changed (1) hide show
  1. sieve_bench/site/index.html +440 -0
sieve_bench/site/index.html ADDED
@@ -0,0 +1,440 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>SIEVE — what a sentence embedding will and won't tell you</title>
7
+ <meta name="description" content="SIEVE is a runnable benchmark that scores how interpretable a sentence-embedding space is — what you can read, decompose, build, and edit in a frozen vector — across 26 tasks and 9 encoders, with a confound audit that refuses tasks a bag-of-words already solves.">
8
+ <link rel="preconnect" href="https://fonts.googleapis.com">
9
+ <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
10
+ <link href="https://fonts.googleapis.com/css2?family=Fraunces:opsz,wght@9..144,400;9..144,500;9..144,600;9..144,800&family=Inter:wght@400;500;600&family=IBM+Plex+Mono:wght@400;500;600&display=swap" rel="stylesheet">
11
+ <style>
12
+ :root{
13
+ --bg:#f6f3ec; --bg2:#efeae0; --panel:#fffdf8; --ink:#23201b; --ink2:#5c554a; --ink3:#8c8475;
14
+ --line:#e1dacb; --line2:#d3cab6;
15
+ --pass:#1d7a6b; /* passes the sieve / readable */
16
+ --pass-soft:#dcefe9;
17
+ --diag:#b4632a; /* diagnostic / caught by the sieve */
18
+ --diag-soft:#f3e4d4;
19
+ --accent:#1d7a6b;
20
+ --shadow:0 1px 2px rgba(40,34,22,.05),0 8px 28px -12px rgba(40,34,22,.16);
21
+ --mono:'IBM Plex Mono',ui-monospace,Menlo,monospace;
22
+ --serif:'Fraunces',Georgia,serif;
23
+ --sans:'Inter',system-ui,-apple-system,sans-serif;
24
+ }
25
+ @media (prefers-color-scheme:dark){:root:not([data-theme="light"]){
26
+ --bg:#15161a; --bg2:#191b20; --panel:#1d1f25; --ink:#e9e4da; --ink2:#a9a294; --ink3:#7d7768;
27
+ --line:#2a2d34; --line2:#363a43;
28
+ --pass:#52c4ac; --pass-soft:#15302b; --diag:#e09a5e; --diag-soft:#34261a; --accent:#52c4ac;
29
+ --shadow:0 1px 2px rgba(0,0,0,.3),0 12px 34px -14px rgba(0,0,0,.6);
30
+ }}
31
+ :root[data-theme="dark"]{
32
+ --bg:#15161a; --bg2:#191b20; --panel:#1d1f25; --ink:#e9e4da; --ink2:#a9a294; --ink3:#7d7768;
33
+ --line:#2a2d34; --line2:#363a43;
34
+ --pass:#52c4ac; --pass-soft:#15302b; --diag:#e09a5e; --diag-soft:#34261a; --accent:#52c4ac;
35
+ --shadow:0 1px 2px rgba(0,0,0,.3),0 12px 34px -14px rgba(0,0,0,.6);
36
+ }
37
+ *{box-sizing:border-box}
38
+ html{scroll-behavior:smooth}
39
+ @media (prefers-reduced-motion:reduce){html{scroll-behavior:auto}*{transition:none!important;animation:none!important}}
40
+ body{margin:0;background:var(--bg);color:var(--ink);font-family:var(--sans);font-size:17px;line-height:1.65;-webkit-font-smoothing:antialiased;text-rendering:optimizeLegibility}
41
+ .wrap{max-width:1080px;margin:0 auto;padding:0 24px}
42
+ a{color:var(--accent);text-decoration:none}
43
+ a:hover{text-decoration:underline;text-underline-offset:3px}
44
+ h1,h2,h3{font-family:var(--serif);font-weight:600;letter-spacing:-.01em;line-height:1.12;color:var(--ink)}
45
+ code,.mono{font-family:var(--mono)}
46
+ .prose{max-width:68ch}
47
+
48
+ /* nav */
49
+ nav{position:sticky;top:0;z-index:50;background:color-mix(in srgb,var(--bg) 86%,transparent);backdrop-filter:blur(10px);-webkit-backdrop-filter:blur(10px);border-bottom:1px solid var(--line)}
50
+ nav .wrap{display:flex;align-items:center;gap:22px;height:56px}
51
+ .brand{font-family:var(--serif);font-weight:600;font-size:19px;letter-spacing:.02em;margin-right:auto;display:flex;align-items:center;gap:10px}
52
+ .brand .mesh{width:18px;height:18px;flex:none}
53
+ nav a.nl{color:var(--ink2);font-size:14px;font-weight:500}
54
+ nav a.nl:hover{color:var(--ink);text-decoration:none}
55
+ .tg{background:none;border:1px solid var(--line2);color:var(--ink2);border-radius:999px;padding:5px 12px;font-size:12px;font-family:var(--mono);cursor:pointer;transition:.25s cubic-bezier(.16,1,.3,1)}
56
+ .tg:hover{color:var(--ink);border-color:var(--ink3)}
57
+ .tg:focus-visible{outline:2px solid var(--accent);outline-offset:2px}
58
+ @media(max-width:680px){nav a.nl{display:none}}
59
+
60
+ /* hero */
61
+ header{padding:72px 0 40px;border-bottom:1px solid var(--line)}
62
+ .kic{font-family:var(--mono);font-size:12.5px;letter-spacing:.16em;text-transform:uppercase;color:var(--pass);font-weight:500;display:flex;align-items:center;gap:9px;margin-bottom:22px}
63
+ .kic::before{content:"";width:26px;height:1px;background:var(--pass)}
64
+ h1.title{font-size:clamp(2.6rem,6vw,4.4rem);font-weight:800;margin:0 0 4px;line-height:1}
65
+ h1.title .expand{display:block;font-size:clamp(1rem,2.2vw,1.35rem);font-weight:500;color:var(--ink2);font-family:var(--sans);letter-spacing:0;margin-top:18px;max-width:30ch}
66
+ .lede{font-size:clamp(1.12rem,2vw,1.32rem);line-height:1.5;color:var(--ink);max-width:46ch;margin:30px 0 0;font-weight:400}
67
+ .lede b{font-weight:600;color:var(--ink)}
68
+ .herofacts{display:flex;flex-wrap:wrap;gap:0;margin-top:40px;border:1px solid var(--line);border-radius:14px;overflow:hidden;background:var(--panel);box-shadow:var(--shadow)}
69
+ .hf{flex:1;min-width:128px;padding:18px 20px;border-right:1px solid var(--line)}
70
+ .hf:last-child{border-right:none}
71
+ .hf .n{font-family:var(--serif);font-weight:600;font-size:1.9rem;line-height:1;color:var(--ink)}
72
+ .hf .l{font-size:12.5px;color:var(--ink2);margin-top:7px;line-height:1.35}
73
+ @media(max-width:560px){.hf{flex:1 1 40%;border-bottom:1px solid var(--line)}}
74
+
75
+ section{padding:62px 0;border-bottom:1px solid var(--line)}
76
+ .eyebrow{font-family:var(--mono);font-size:12px;letter-spacing:.15em;text-transform:uppercase;color:var(--ink3);font-weight:500;margin:0 0 14px}
77
+ h2{font-size:clamp(1.7rem,3.2vw,2.3rem);margin:0 0 8px}
78
+ .sub{color:var(--ink2);font-size:1.05rem;max-width:62ch;margin:0 0 30px}
79
+ p{margin:0 0 18px}
80
+ .prose p{color:var(--ink);}
81
+ .prose p.muted{color:var(--ink2)}
82
+
83
+ /* the core idea — two cards */
84
+ .split{display:grid;grid-template-columns:1fr 1fr;gap:18px}
85
+ @media(max-width:720px){.split{grid-template-columns:1fr}}
86
+ .idea{background:var(--panel);border:1px solid var(--line);border-radius:14px;padding:26px;box-shadow:var(--shadow)}
87
+ .idea .q{font-family:var(--serif);font-size:1.25rem;color:var(--ink);font-weight:600;margin-bottom:10px}
88
+ .idea.usual{opacity:.92}
89
+ .idea .tag{font-family:var(--mono);font-size:11px;letter-spacing:.1em;text-transform:uppercase;color:var(--ink3);margin-bottom:14px;display:block}
90
+ .idea.sieve{border-color:var(--pass);box-shadow:0 0 0 1px var(--pass),var(--shadow)}
91
+ .idea.sieve .tag{color:var(--pass)}
92
+
93
+ /* families */
94
+ .fam{border:1px solid var(--line);border-radius:13px;overflow:hidden;margin-bottom:14px;background:var(--panel)}
95
+ .fam>summary{list-style:none;cursor:pointer;padding:18px 22px;display:flex;align-items:center;gap:16px}
96
+ .fam>summary::-webkit-details-marker{display:none}
97
+ .fam .letter{font-family:var(--serif);font-weight:800;font-size:1.5rem;width:38px;height:38px;flex:none;display:grid;place-items:center;border-radius:9px;background:var(--bg2);color:var(--pass)}
98
+ .fam .fname{font-family:var(--serif);font-weight:600;font-size:1.18rem}
99
+ .fam .fdesc{color:var(--ink2);font-size:.92rem;margin-top:2px}
100
+ .fam .chev{margin-left:auto;color:var(--ink3);transition:transform .3s cubic-bezier(.16,1,.3,1);font-family:var(--mono)}
101
+ .fam[open] .chev{transform:rotate(90deg)}
102
+ .fam .tasks{padding:2px 22px 18px}
103
+ .trow{display:flex;gap:12px;padding:9px 0;border-top:1px solid var(--line);align-items:baseline}
104
+ .trow .tid{font-family:var(--mono);font-size:12px;color:var(--ink3);width:38px;flex:none}
105
+ .trow .tnm{font-weight:500;color:var(--ink);min-width:200px}
106
+ .trow .ttx{color:var(--ink2);font-size:.92rem}
107
+ .pill{font-family:var(--mono);font-size:9.5px;letter-spacing:.06em;text-transform:uppercase;padding:2px 7px;border-radius:999px;font-weight:600;white-space:nowrap;align-self:center}
108
+ .pill.diag{background:var(--diag-soft);color:var(--diag)}
109
+ .pill.gen{background:var(--bg2);color:var(--ink3)}
110
+ @media(max-width:600px){.trow{flex-wrap:wrap}.trow .tnm{min-width:0}}
111
+
112
+ /* audit rules */
113
+ .rules{display:grid;grid-template-columns:repeat(3,1fr);gap:16px;margin-top:8px}
114
+ @media(max-width:720px){.rules{grid-template-columns:1fr}}
115
+ .rule{background:var(--panel);border:1px solid var(--line);border-radius:13px;padding:22px;box-shadow:var(--shadow)}
116
+ .rule .rn{font-family:var(--mono);font-size:12px;color:var(--diag);font-weight:600;letter-spacing:.08em;margin-bottom:10px}
117
+ .rule h3{font-size:1.08rem;margin:0 0 8px}
118
+ .rule p{font-size:.93rem;color:var(--ink2);margin:0}
119
+ .rule code{font-size:.82em;background:var(--bg2);padding:1px 5px;border-radius:4px;color:var(--ink)}
120
+
121
+ /* leaderboard / heatmap */
122
+ .lbwrap{background:var(--panel);border:1px solid var(--line);border-radius:14px;padding:8px;box-shadow:var(--shadow);overflow-x:auto}
123
+ table.heat{border-collapse:collapse;width:100%;font-size:13.5px;min-width:560px}
124
+ table.heat th,table.heat td{padding:8px 6px;text-align:center}
125
+ table.heat thead th{font-family:var(--mono);font-size:11px;letter-spacing:.04em;color:var(--ink2);font-weight:500;border-bottom:1px solid var(--line2);position:sticky;left:auto}
126
+ table.heat th.enc{text-align:left;font-family:var(--sans);font-weight:600;font-size:13.5px;color:var(--ink);white-space:nowrap;padding-left:14px}
127
+ table.heat td.delta{font-family:var(--mono);font-weight:600;font-size:14px;color:var(--ink);border-left:1px solid var(--line2)}
128
+ table.heat .cell{font-family:var(--mono);font-size:12.5px;color:#0c1410;border-radius:5px;cursor:default;font-weight:500}
129
+ /* dark-theme heat-cell text: explicit toggle first, then OS preference unless the user forced light. A selector list cannot contain an @media rule, so the two cases are separate rules. */ :root[data-theme="dark"] table.heat .cell{color:#eafffa} @media(prefers-color-scheme:dark){:root:not([data-theme="light"]) table.heat .cell{color:#eafffa}}
130
+ table.heat tbody tr{border-bottom:1px solid var(--line)}
131
+ table.heat tbody tr:last-child{border-bottom:none}
132
+ table.heat tbody tr.floor th.enc{color:var(--diag)}
133
+ .ranknum{font-family:var(--mono);color:var(--ink3);font-size:11px;margin-right:8px}
134
+ .lbnote{font-size:12.5px;color:var(--ink2);margin-top:14px;font-family:var(--mono);line-height:1.6}
135
+ .lbnote b{color:var(--ink)}
136
+ .dbar{height:7px;border-radius:4px;background:var(--pass);display:inline-block;vertical-align:middle;margin-left:8px;opacity:.5}
137
+
138
+ /* findings */
139
+ .find{display:grid;grid-template-columns:1fr;gap:0}
140
+ .f{padding:24px 0;border-top:1px solid var(--line);display:grid;grid-template-columns:64px 1fr;gap:20px}
141
+ .f:first-child{border-top:none}
142
+ .f .fn{font-family:var(--serif);font-size:2.1rem;font-weight:800;color:var(--line2);line-height:1}
143
+ .f h3{font-size:1.2rem;margin:0 0 7px}
144
+ .f p{margin:0;color:var(--ink2);font-size:.97rem}
145
+ .f p b{color:var(--ink);font-weight:600}
146
+ .f .num{font-family:var(--mono);color:var(--pass);font-weight:600}
147
+ @media(max-width:560px){.f{grid-template-columns:1fr;gap:6px}.f .fn{font-size:1.4rem}}
148
+
149
+ /* quickstart */
150
+ .code{background:#15171c;border:1px solid #262a31;border-radius:12px;padding:20px 22px;font-family:var(--mono);font-size:13.5px;color:#cfd6dd;overflow-x:auto;line-height:1.85;position:relative}
151
+ .code .c{color:#6b7785}
152
+ .code .g{color:#7fd1bd}
153
+ .code .y{color:#d8b673}
154
+ .copy{position:absolute;top:12px;right:12px;background:#262a31;border:none;color:#9aa3ad;font-family:var(--mono);font-size:11px;padding:5px 10px;border-radius:6px;cursor:pointer;transition:.2s}
155
+ .copy:hover{background:#323843;color:#dfe4ea}
156
+ .steps{counter-reset:s;margin:0 0 26px;padding:0;list-style:none}
157
+ .steps li{counter-increment:s;padding:4px 0 4px 38px;position:relative;color:var(--ink2)}
158
+ .steps li::before{content:counter(s);position:absolute;left:0;top:2px;width:24px;height:24px;border-radius:50%;background:var(--bg2);border:1px solid var(--line2);color:var(--ink);font-family:var(--mono);font-size:12px;display:grid;place-items:center}
159
+ .steps li b{color:var(--ink)}
160
+
161
+ .callout{background:var(--pass-soft);border:1px solid color-mix(in srgb,var(--pass) 35%,transparent);border-radius:12px;padding:18px 22px;font-size:.96rem;color:var(--ink)}
162
+ .callout b{color:var(--pass)}
163
+
164
+ /* footer */
165
+ footer{padding:54px 0 80px;color:var(--ink2);font-size:14px}
166
+ footer .grid{display:flex;flex-wrap:wrap;gap:40px;margin-bottom:34px}
167
+ footer h4{font-family:var(--mono);font-size:11px;letter-spacing:.12em;text-transform:uppercase;color:var(--ink3);margin:0 0 12px;font-weight:500}
168
+ footer a{display:block;color:var(--ink2);margin-bottom:7px}
169
+ footer a:hover{color:var(--accent)}
170
+ .disc{border-top:1px solid var(--line);padding-top:24px;font-size:13px;color:var(--ink3);max-width:74ch;line-height:1.65}
171
+ .btn{display:inline-flex;align-items:center;gap:8px;background:var(--ink);color:var(--bg);padding:11px 20px;border-radius:999px;font-weight:600;font-size:14px;font-family:var(--sans);transition:.25s cubic-bezier(.16,1,.3,1)}
172
+ .btn:hover{text-decoration:none;transform:translateY(-1px);box-shadow:var(--shadow)}
173
+ .btn.alt{background:transparent;color:var(--ink);border:1px solid var(--line2)}
174
+ .btnrow{display:flex;gap:12px;flex-wrap:wrap;margin-top:30px}
175
+ .mesh path,.mesh line{stroke:var(--pass);stroke-width:1.2;fill:none}
176
+ </style>
177
+ </head>
178
+ <body>
179
+
180
+ <nav><div class="wrap">
181
+ <span class="brand">
182
+ <svg class="mesh" viewBox="0 0 20 20" aria-hidden="true"><path d="M2 5h16M2 10h16M2 15h16M5 2v16M10 2v16M15 2v16"/></svg>
183
+ SIEVE
184
+ </span>
185
+ <a class="nl" href="#idea">Idea</a>
186
+ <a class="nl" href="#measures">Tasks</a>
187
+ <a class="nl" href="#audit">Audit</a>
188
+ <a class="nl" href="#board">Leaderboard</a>
189
+ <a class="nl" href="#findings">Findings</a>
190
+ <a class="nl" href="#run">Run it</a>
191
+ <button class="tg" id="tg" aria-label="Toggle colour theme">auto</button>
192
+ </div></nav>
193
+
194
+ <header><div class="wrap">
195
+ <div class="kic">Sentence-embedding Interpretability EValuation</div>
196
+ <h1 class="title">SIEVE
197
+ <span class="expand">A benchmark for what a sentence embedding will, and won't, tell you.</span>
198
+ </h1>
199
+ <p class="lede">Most benchmarks ask whether a sentence embedding is <i>good</i> — does it retrieve, cluster, rank. SIEVE asks whether it is <b>legible</b>: given one frozen 1024-d vector, what can you <b>read</b> out of it, <b>decompose</b>, <b>build</b>, or <b>edit</b> — and what is simply <b>not there</b>. It runs on any encoder, on CPU, and it is built to refuse to fool you.</p>
200
+ <div class="herofacts">
201
+ <div class="hf"><div class="n">26</div><div class="l">tasks across 5 capability families</div></div>
202
+ <div class="hf"><div class="n">9</div><div class="l">encoders profiled (SONAR → GloVe-bag)</div></div>
203
+ <div class="hf"><div class="n">5</div><div class="l">live discriminating axes, audit-gated</div></div>
204
+ <div class="hf"><div class="n">CPU</div><div class="l">runnable on your own HF encoder</div></div>
205
+ </div>
206
+ </div></header>
207
+
208
+ <section id="idea"><div class="wrap">
209
+ <p class="eyebrow">The premise</p>
210
+ <h2>Legible is not the same as good</h2>
211
+ <p class="sub">A high score on a retrieval benchmark tells you a vector is <i>useful</i>. It says nothing about whether a human — or a safety monitor — can tell what is inside it. Those are different questions, and almost nothing measures the second one.</p>
212
+ <div class="split">
213
+ <div class="idea usual">
214
+ <span class="tag">What benchmarks usually ask</span>
215
+ <div class="q">"Is this embedding good?"</div>
216
+ <p class="muted" style="margin:0;color:var(--ink2);font-size:.96rem">Retrieval accuracy, clustering quality, STS correlation. Optimised for downstream utility. A perfectly opaque vector can top every one of them.</p>
217
+ </div>
218
+ <div class="idea sieve">
219
+ <span class="tag">What SIEVE asks</span>
220
+ <div class="q">"Is this embedding legible?"</div>
221
+ <p style="margin:0;color:var(--ink2);font-size:.96rem">Can you read the words, the numbers, the order? Decompose its structure? Build one from parts? Edit one sentence of three and leave the rest? And crucially — <b style="color:var(--pass)">where does that legibility run out?</b></p>
222
+ </div>
223
+ </div>
224
+ <p class="prose" style="margin-top:30px;color:var(--ink2)">The name is the method. A sieve sorts what passes through from what is caught. SIEVE separates what a sentence embedding genuinely carries in a readable form from what only <i>looks</i> readable because a bag-of-words baseline already solved it. The second half is the harder, more honest measurement — and it is where the interesting science lives.</p>
225
+ </div></section>
226
+
227
+ <section id="measures"><div class="wrap">
228
+ <p class="eyebrow">What it measures</p>
229
+ <h2>Five families, twenty-six tasks</h2>
230
+ <p class="sub">Every task is normalised to 0–1 and reported against a <b>baseline</b> (random / bag-of-words / surface-position) and a <b>ceiling</b> (oracle / full-vector). Tasks marked <span class="pill diag" style="display:inline-block">diag</span> are diagnostic — a <i>low</i> score is the finding, not a failure. <span class="pill gen" style="display:inline-block">gen</span> tasks need a decoder.</p>
231
+ <div id="families"></div>
232
+ </div></section>
233
+
234
+ <section id="audit"><div class="wrap">
235
+ <p class="eyebrow">Why you can trust it</p>
236
+ <h2>The audit: a benchmark that refuses to fool itself</h2>
237
+ <p class="sub">The hardest failure in interpretability is a high number that means nothing — a probe that "reads" a concept that a bag-of-words could read just as well. SIEVE bakes a confound check into the harness. Every task is <b>audited</b>, and the headline averages <b>only the tasks that pass</b>.</p>
238
+ <div class="rules">
239
+ <div class="rule">
240
+ <div class="rn">RULE 1 · SPECIFICITY</div>
241
+ <h3>Not z-specific → excluded</h3>
242
+ <p>If a <code>bag-of-words</code> or <code>surface-position</code> baseline already reaches the score, the task isn't measuring the <i>embedding</i> — a dumb null solved it. It cannot count toward "this vector is legible."</p>
243
+ </div>
244
+ <div class="rule">
245
+ <div class="rn">RULE 2 · HEADROOM</div>
246
+ <h3>Degenerate → excluded</h3>
247
+ <p>If the score barely clears its own baseline (<code>Δ&lt;0.05</code>), the task is saturated or trivial. A pass with no headroom carries no ranking signal and is dropped from the comparison.</p>
248
+ </div>
249
+ <div class="rule">
250
+ <div class="rn">RULE 3 · AGREEMENT</div>
251
+ <h3>Disagreeing arms → excluded</h3>
252
+ <p>Tasks measured two ways (a probe arm and a decode arm) must agree to <code>±0.15</code>. A gap means one arm is buggy — so the number is suppressed until it's reconciled.</p>
253
+ </div>
254
+ </div>
255
+ <p class="prose" style="margin-top:28px;color:var(--ink2)">This is not theoretical. During development the audit caught the benchmark's own most-quotable headline — a "universal no-binding" result that was really a <i>broken task</i> scoring chance for everyone — and forced it to be rebuilt on natural data before it could ship. The two-track leaderboard below reports an apples-to-apples <b>intersection</b> of audit-passing tasks, never one inflated winner column.</p>
256
+ </div></section>
257
+
258
+ <section id="board"><div class="wrap">
259
+ <p class="eyebrow">The leaderboard</p>
260
+ <h2>Nine encoders, five live axes</h2>
261
+ <p class="sub">The fair comparative number — <b>Encode-Readout-Δ</b> — is the mean of <code>(score − baseline) / (ceiling − baseline)</code> over the five tasks that <i>every</i> encoder runs and audit-passes on the same probe-only arm. Hover a cell for the per-axis score. The static <b style="color:var(--diag)">GloVe-bag</b> is a literal bag-of-words — the floor, and a control that the additivity axis correctly flags.</p>
262
+ <div class="lbwrap">
263
+ <table class="heat" id="board-table"></table>
264
+ </div>
265
+ <p class="lbnote" id="lbnote"></p>
266
+ </div></section>
267
+
268
+ <section id="findings"><div class="wrap">
269
+ <p class="eyebrow">What it found</p>
270
+ <h2>Five honest findings</h2>
271
+ <p class="sub">The benchmark's first scientific use — a designed sweep of size, training objective, and architecture — produced results that were pre-registered before looking, and one claim it had to walk back through its own controls.</p>
272
+ <div class="find">
273
+ <div class="f"><div class="fn">01</div><div>
274
+ <h3>Interpretability is not one number</h3>
275
+ <p>Across the nine encoders the discriminating axes rank them <b>almost independently</b> — mean cross-axis rank-correlation <span class="num">+0.21</span>. Lexical-readout and feature-monosemanticity are even <b>anti-correlated</b> (<span class="num">−0.67</span>): the encoders best at reading words out are the <i>worst</i> at having monosemantic features. There is no single "interpretability score" — which is why SIEVE reports a profile, not a winner.</p>
276
+ </div></div>
277
+ <div class="f"><div class="fn">02</div><div>
278
+ <h3>No abstract role-binding — in any encoder</h3>
279
+ <p>Trained to read "who is the agent" on one sentence construction and tested on another, <b>every</b> encoder fails (cross-construction AUC <span class="num">0.47–0.60</span>, none above 0.70) while a surface-position baseline reads it perfectly (<span class="num">0.91</span>). Mean-pooled sentence embeddings carry <i>who</i> and <i>what</i> as content, but not <i>who-did-what-to-whom</i> as structure. Verified on 1,270 naturally-parsed sentences across five syntactic forms.</p>
280
+ </div></div>
281
+ <div class="f"><div class="fn">03</div><div>
282
+ <h3>Training objective reshapes <i>additivity</i>, not readout</h3>
283
+ <p>A masked-LM BERT and a contrastive retriever reach the <b>same</b> abstract-readout score — but contrastive training makes the pooled space far more of a literal bag-of-words. Objective controls how additive the geometry is, not how much abstract content is legible.</p>
284
+ </div></div>
285
+ <div class="f"><div class="fn">04</div><div>
286
+ <h3>Interpretability does not scale with size</h3>
287
+ <p>Across e5-small → base → large the readout score is <b>flat within seed noise</b> (<span class="num">σ≈0.016</span>, span 0.03, non-monotone). Bigger is not more legible — at least not within a family.</p>
288
+ </div></div>
289
+ <div class="f"><div class="fn">05</div><div>
290
+ <h3>The control behaves exactly as it should</h3>
291
+ <p>A literal static bag-of-GloVe-vectors is the <i>only</i> encoder flagged genuinely additive — order-sensitivity exactly <span class="num">0.000</span>, vector cosine to the word-mean <span class="num">0.985</span> — and sits at the readout floor. When your benchmark's planted negative control lands precisely where it must, the axis is calibrated. <span style="color:var(--ink3)">(And the over-claim it caught: an early "contrastive embeddings are just bags" headline was walked back once a shuffled-word + order-permutation control showed it was a length artifact.)</span></p>
292
+ </div></div>
293
+ </div>
294
+ </div></section>
295
+
296
+ <section id="run"><div class="wrap">
297
+ <p class="eyebrow">Run it</p>
298
+ <h2>Score your own encoder in three lines</h2>
299
+ <ol class="steps">
300
+ <li>Download <code>sieve_bench/</code> from the Hugging Face repo — no SONAR, no GPU required.</li>
301
+ <li><b>Point it at any model</b> via the <code>hf:&lt;id&gt;:&lt;pool&gt;</code> adapter, or implement the three-method <code>Encoder</code> interface for your own.</li>
302
+ <li>Read the auto-generated two-track leaderboard with its audit column.</li>
303
+ </ol>
304
+ <!-- explicit <br> per command: .code sets no white-space:pre, so bare source newlines collapse into one run-on line --><div class="code">
305
+ <button class="copy" type="button" onclick="cp(this)">copy</button>
306
+ <span class="c"># a stranger with any HuggingFace sentence encoder, on CPU</span><br>
307
+ pip install -r requirements.txt<br>
308
+ python run.py --encoder <span class="y">hf:sentence-transformers/all-MiniLM-L6-v2:mean</span> --tasks all<br>
309
+ python make_leaderboard.py <span class="c"># → LEADERBOARD.md</span>
310
+ </div>
311
+ <div class="callout" style="margin-top:24px"><b>Decode-free by default.</b> The core readout, decomposition, and cross-lingual families run on any encoder that produces a vector. Construction and editing tasks need a decoder and skip cleanly when one isn't present — so the comparison stays fair across encoders that can't generate.</div>
312
+ </div></section>
313
+
314
+ <footer><div class="wrap">
315
+ <div class="grid">
316
+ <div>
317
+ <h4>Artifact</h4>
318
+ <a href="https://huggingface.co/nickypro/sonar-sae/tree/main/sieve_bench">Hugging Face · sieve_bench</a>
319
+ <a href="https://huggingface.co/nickypro/sonar-sae/blob/main/sieve_bench/README.md">README &amp; quickstart</a>
320
+ <a href="https://huggingface.co/nickypro/sonar-sae/blob/main/sieve_bench/SIEVE_SPEC.md">Full design spec</a>
321
+ <a href="https://huggingface.co/nickypro/sonar-sae/blob/main/sieve_bench/LEADERBOARD.md">Raw leaderboard</a>
322
+ </div>
323
+ <div>
324
+ <h4>Design invariants</h4>
325
+ <a style="cursor:default">Baseline + ceiling + control, every task</a>
326
+ <a style="cursor:default">Audit-pass tasks only in the headline</a>
327
+ <a style="cursor:default">Diagnostic tasks: low is the finding</a>
328
+ <a style="cursor:default">Per-axis profile, never one winner</a>
329
+ </div>
330
+ <div>
331
+ <h4>The five families</h4>
332
+ <a style="cursor:default">A · Readout</a>
333
+ <a style="cursor:default">B · Decomposition</a>
334
+ <a style="cursor:default">C · Construction</a>
335
+ <a style="cursor:default">D · Editing</a>
336
+ <a style="cursor:default">E · Cross-lingual</a>
337
+ </div>
338
+ </div>
339
+ <div class="btnrow" style="margin-bottom:34px">
340
+ <a class="btn" href="https://huggingface.co/nickypro/sonar-sae/tree/main/sieve_bench">Get SIEVE →</a>
341
+ <a class="btn alt" href="#measures">See the tasks</a>
342
+ </div>
343
+ <p class="disc">This page and the SIEVE benchmark were generated by an AI research agent. Findings were produced by running code and reading the results, with pre-registered predictions and an automated confound audit; they have primarily been machine-verified and self-critiqued, not independently reproduced by a human. Numbers are read from the actual benchmark runs. "Legibility" here means probe / decode / reconstruction readout of a frozen embedding — not a claim about human perception. Reviewed adversarially by a second model during development; the benchmark walked back one over-claim and fixed one broken headline task through its own controls. Treat the comparative leaderboard as a research instrument, not a product ranking: several gaps sit within seed noise, as noted on the board.</p>
344
+ </div></footer>
345
+
346
+ <div style="position:fixed;top:10px;right:10px;background:rgba(0,0,0,0.45);color:rgba(255,255,255,0.7);font-size:9px;padding:2px 7px;border-radius:8px;z-index:9999;font-family:system-ui,-apple-system,sans-serif;pointer-events:none;backdrop-filter:blur(6px);-webkit-backdrop-filter:blur(6px);letter-spacing:0.3px;text-transform:uppercase;font-weight:500;">ai gen</div>
347
+
348
+ <script>
349
+ /* ---- theme toggle ---- */
350
+ const modes=['auto','light','dark'];
351
+ let mi=modes.indexOf(localStorage.getItem('theme')||'auto'); if(mi<0)mi=0;
352
+ const tgEl=document.getElementById('tg');
353
+ function applyTheme(){const m=modes[mi];if(m==='auto')document.documentElement.removeAttribute('data-theme');else document.documentElement.setAttribute('data-theme',m);tgEl.textContent=m;}
354
+ tgEl.addEventListener('click',()=>{mi=(mi+1)%3;localStorage.setItem('theme',modes[mi]);applyTheme();});
355
+ applyTheme();
356
+
357
+ /* ---- copy ---- */
358
+ function cp(b){/* Copy the snippet that contains button b. Text is read from a clone with the button removed, so the button label (copy/copied) never leaks into the result and the source line breaks survive (innerText would follow the collapsed rendering). The label reports the real outcome of the clipboard write. */const box=b.parentElement.cloneNode(true);box.querySelectorAll('button').forEach(x=>x.remove());const code=box.textContent.trim();const flash=t=>{b.textContent=t;setTimeout(()=>b.textContent='copy',1400);};if(!(navigator.clipboard&&navigator.clipboard.writeText)){flash('failed');return;}navigator.clipboard.writeText(code).then(()=>flash('copied'),()=>flash('failed'));}
359
+
360
+ /* ---- families data ---- */
361
+ const FAM=[
362
+ {L:'A',n:'Readout',d:'read content out of a frozen vector',t:[
363
+ ['t01','lexical bag','which content words are present',''],
364
+ ['t02','number-exact','recover the exact numeric value',''],
365
+ ['t03','entity-presence','is entity X present, tested across paraphrase',''],
366
+ ['t04','negation-scope','negation present, and which clause it scopes',''],
367
+ ['t05','position / order','recover word order from the vector',''],
368
+ ['t06','thematic-role','is X the agent? — across constructions','diag'],
369
+ ['t07','meaning-coverage','rebuild meaning from readable properties','gen'],
370
+ ['t08','length-generalization','does a short-trained readout survive on long inputs',''],
371
+ ['t09','coreference','does a pronoun refer to entity X','diag']]},
372
+ {L:'B',n:'Decomposition',d:'how the vector is organised',t:[
373
+ ['t10','dimensionality','effective rank, intrinsic dimension of the manifold','diag'],
374
+ ['t11','position un-rotation','can a token be separated from its position','diag'],
375
+ ['t12','additivity','is the vector just a bag of word-vectors','diag'],
376
+ ['t13','SAE-monosemanticity','do sparse features carry single clean concepts',''],
377
+ ['t14','capacity-law','how recoverability decays with sentence length','']]},
378
+ {L:'C',n:'Construction',d:'build a vector from parts',t:[
379
+ ['t15','sentence-from-words','construct a faithful vector from shuffled words','gen'],
380
+ ['t16','vocab-coverage','single-word round-trip across a vocabulary','gen'],
381
+ ['t17','recombination','is uniform pooling already near-optimal','diag']]},
382
+ {L:'D',n:'Editing',d:'change a vector precisely',t:[
383
+ ['t18','concept-steer','add a concept direction — does it causally appear','diag'],
384
+ ['t22','word-edit','replace X with Y, preserving the rest','gen'],
385
+ ['t23','edit sentence 2 of 3','edit one clause, leave the others intact','gen'],
386
+ ['t24','sentence-reorder','swap the order of encoded sentences','diag'],
387
+ ['t25','concept-injection','inject content at the predicted capacity budget','gen'],
388
+ ['t26','causal-identifiability','swap agent⇄patient while preserving content','diag']]},
389
+ {L:'E',n:'Cross-lingual',d:'across languages and encoders',t:[
390
+ ['t19','cross-lingual readout','does a content probe transfer across languages',''],
391
+ ['t20','decode-by-language','readout quality per language','gen'],
392
+ ['t21','encoder generality','does the profile hold across pooling / encoders','']]}
393
+ ];
394
+ const famHost=document.getElementById('families');
395
+ FAM.forEach(f=>{
396
+ const tasks=f.t.map(t=>`<div class="trow"><span class="tid">${t[0]}</span><span class="tnm">${t[1]}</span><span class="ttx">${t[2]}</span>${t[3]==='diag'?'<span class="pill diag">diag</span>':t[3]==='gen'?'<span class="pill gen">gen</span>':''}</div>`).join('');
397
+ famHost.insertAdjacentHTML('beforeend',`<details class="fam"${f.L==='A'?' open':''}><summary><span class="letter">${f.L}</span><span><span class="fname">${f.n}</span><div class="fdesc">${f.d} · ${f.t.length} tasks</div></span><span class="chev">›</span></summary><div class="tasks">${tasks}</div></details>`);
398
+ });
399
+
400
+ /* ---- leaderboard heatmap ---- */
401
+ const AX=['t01','t05','t08','t13','t19'];
402
+ const AXL={t01:'lexical',t05:'order',t08:'length-gen',t13:'SAE-mono',t19:'cross-ling'};
403
+ const ROWS=[
404
+ ['SONAR',0.762,{t01:0.41,t05:0.84,t08:0.73,t13:0.83,t19:1.00},false],
405
+ ['gte-large',0.719,{t01:0.31,t05:0.63,t08:0.70,t13:0.96,t19:1.00},false],
406
+ ['mpnet',0.704,{t01:0.26,t05:0.66,t08:0.71,t13:0.96,t19:0.93},false],
407
+ ['e5-base',0.690,{t01:0.38,t05:0.70,t08:0.56,t13:0.85,t19:0.96},false],
408
+ ['LaBSE',0.671,{t01:0.50,t05:0.67,t08:0.53,t13:0.66,t19:1.00},false],
409
+ ['e5-large',0.644,{t01:0.40,t05:0.53,t08:0.49,t13:0.80,t19:0.99},false],
410
+ ['e5-small',0.644,{t01:0.40,t05:0.52,t08:0.51,t13:0.80,t19:0.98},false],
411
+ ['bert-base',0.565,{t01:0.36,t05:0.80,t08:0.14,t13:0.91,t19:0.61},false],
412
+ ['GloVe-bag',0.346,{t01:0.33,t05:0.09,t08:0.06,t13:0.74,t19:0.51},true]
413
+ ];
414
+ function lerp(a,b,t){return a+(b-a)*t;}
415
+ function cellColor(v){ // v 0..1 → pale → pass colour
416
+ const t=Math.max(0,Math.min(1,v));
417
+ const dark=document.documentElement.getAttribute('data-theme')==='dark'|| (matchMedia('(prefers-color-scheme:dark)').matches && document.documentElement.getAttribute('data-theme')!=='light');
418
+ if(dark){return `rgba(82,196,172,${0.08+0.62*t})`;}
419
+ return `rgba(29,122,107,${0.06+0.66*t})`;
420
+ }
421
+ function buildBoard(){
422
+ const maxD=ROWS[0][1];
423
+ let h=`<thead><tr><th class="enc">Encoder</th>`+AX.map(a=>`<th>${AXL[a]}</th>`).join('')+`<th>Δ&nbsp;readout</th></tr></thead><tbody>`;
424
+ ROWS.forEach((r,i)=>{
425
+ const [name,delta,cells,floor]=r;
426
+ h+=`<tr class="${floor?'floor':''}"><th class="enc"><span class="ranknum">${String(i+1).padStart(2,'0')}</span>${name}</th>`;
427
+ AX.forEach(a=>{const v=cells[a];h+=`<td><span class="cell" title="${name} · ${AXL[a]} = ${v.toFixed(2)}" style="display:inline-block;min-width:42px;padding:4px 0;background:${cellColor(v)}">${v.toFixed(2)}</span></td>`;});
428
+ h+=`<td class="delta">${delta.toFixed(3)}<span class="dbar" style="width:${28*delta/maxD}px"></span></td></tr>`;
429
+ });
430
+ h+=`</tbody>`;
431
+ document.getElementById('board-table').innerHTML=h;
432
+ }
433
+ buildBoard();
434
+ document.getElementById('lbnote').innerHTML='<b>How to read it.</b> Δ-readout leads with SONAR (0.762) but the gap to the field is small — second place is 0.046 behind, <b>inside the spread and within seed noise</b>. The interesting signal is the columns disagreeing: no encoder wins everywhere. mpnet &amp; gte top SAE-monosemanticity yet sit near the bottom on lexical readout; LaBSE leads lexical; SONAR leads order. That disagreement <b>is</b> finding&nbsp;01.';
435
+ /* recolour on theme change */
436
+ new MutationObserver(buildBoard).observe(document.documentElement,{attributes:true,attributeFilter:['data-theme']});
437
+ matchMedia('(prefers-color-scheme:dark)').addEventListener('change',buildBoard);
438
+ </script>
439
+ </body>
440
+ </html>