Upload sieve_bench/site/index.html with huggingface_hub
Browse files- sieve_bench/site/index.html +440 -0
sieve_bench/site/index.html
ADDED
|
@@ -0,0 +1,440 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
|
| 2 |
+
<html lang="en">
|
| 3 |
+
<head>
|
| 4 |
+
<meta charset="UTF-8">
|
| 5 |
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
| 6 |
+
<title>SIEVE — what a sentence embedding will and won't tell you</title>
|
| 7 |
+
<meta name="description" content="SIEVE is a runnable benchmark that scores how interpretable a sentence-embedding space is — what you can read, decompose, build, and edit in a frozen vector — across 26 tasks and 9 encoders, with a confound audit that refuses tasks a bag-of-words already solves.">
|
| 8 |
+
<link rel="preconnect" href="https://fonts.googleapis.com">
|
| 9 |
+
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
|
| 10 |
+
<link href="https://fonts.googleapis.com/css2?family=Fraunces:opsz,wght@9..144,400;9..144,500;9..144,600;9..144,800&family=Inter:wght@400;500;600&family=IBM+Plex+Mono:wght@400;500;600&display=swap" rel="stylesheet">
|
| 11 |
+
<style>
|
| 12 |
+
:root{
|
| 13 |
+
--bg:#f6f3ec; --bg2:#efeae0; --panel:#fffdf8; --ink:#23201b; --ink2:#5c554a; --ink3:#8c8475;
|
| 14 |
+
--line:#e1dacb; --line2:#d3cab6;
|
| 15 |
+
--pass:#1d7a6b; /* passes the sieve / readable */
|
| 16 |
+
--pass-soft:#dcefe9;
|
| 17 |
+
--diag:#b4632a; /* diagnostic / caught by the sieve */
|
| 18 |
+
--diag-soft:#f3e4d4;
|
| 19 |
+
--accent:#1d7a6b;
|
| 20 |
+
--shadow:0 1px 2px rgba(40,34,22,.05),0 8px 28px -12px rgba(40,34,22,.16);
|
| 21 |
+
--mono:'IBM Plex Mono',ui-monospace,Menlo,monospace;
|
| 22 |
+
--serif:'Fraunces',Georgia,serif;
|
| 23 |
+
--sans:'Inter',system-ui,-apple-system,sans-serif;
|
| 24 |
+
}
|
| 25 |
+
@media (prefers-color-scheme:dark){:root:not([data-theme="light"]){
|
| 26 |
+
--bg:#15161a; --bg2:#191b20; --panel:#1d1f25; --ink:#e9e4da; --ink2:#a9a294; --ink3:#7d7768;
|
| 27 |
+
--line:#2a2d34; --line2:#363a43;
|
| 28 |
+
--pass:#52c4ac; --pass-soft:#15302b; --diag:#e09a5e; --diag-soft:#34261a; --accent:#52c4ac;
|
| 29 |
+
--shadow:0 1px 2px rgba(0,0,0,.3),0 12px 34px -14px rgba(0,0,0,.6);
|
| 30 |
+
}}
|
| 31 |
+
:root[data-theme="dark"]{
|
| 32 |
+
--bg:#15161a; --bg2:#191b20; --panel:#1d1f25; --ink:#e9e4da; --ink2:#a9a294; --ink3:#7d7768;
|
| 33 |
+
--line:#2a2d34; --line2:#363a43;
|
| 34 |
+
--pass:#52c4ac; --pass-soft:#15302b; --diag:#e09a5e; --diag-soft:#34261a; --accent:#52c4ac;
|
| 35 |
+
--shadow:0 1px 2px rgba(0,0,0,.3),0 12px 34px -14px rgba(0,0,0,.6);
|
| 36 |
+
}
|
| 37 |
+
*{box-sizing:border-box}
|
| 38 |
+
html{scroll-behavior:smooth}
|
| 39 |
+
@media (prefers-reduced-motion:reduce){html{scroll-behavior:auto}*{transition:none!important;animation:none!important}}
|
| 40 |
+
body{margin:0;background:var(--bg);color:var(--ink);font-family:var(--sans);font-size:17px;line-height:1.65;-webkit-font-smoothing:antialiased;text-rendering:optimizeLegibility}
|
| 41 |
+
.wrap{max-width:1080px;margin:0 auto;padding:0 24px}
|
| 42 |
+
a{color:var(--accent);text-decoration:none}
|
| 43 |
+
a:hover{text-decoration:underline;text-underline-offset:3px}
|
| 44 |
+
h1,h2,h3{font-family:var(--serif);font-weight:600;letter-spacing:-.01em;line-height:1.12;color:var(--ink)}
|
| 45 |
+
code,.mono{font-family:var(--mono)}
|
| 46 |
+
.prose{max-width:68ch}
|
| 47 |
+
|
| 48 |
+
/* nav */
|
| 49 |
+
nav{position:sticky;top:0;z-index:50;background:color-mix(in srgb,var(--bg) 86%,transparent);backdrop-filter:blur(10px);-webkit-backdrop-filter:blur(10px);border-bottom:1px solid var(--line)}
|
| 50 |
+
nav .wrap{display:flex;align-items:center;gap:22px;height:56px}
|
| 51 |
+
.brand{font-family:var(--serif);font-weight:600;font-size:19px;letter-spacing:.02em;margin-right:auto;display:flex;align-items:center;gap:10px}
|
| 52 |
+
.brand .mesh{width:18px;height:18px;flex:none}
|
| 53 |
+
nav a.nl{color:var(--ink2);font-size:14px;font-weight:500}
|
| 54 |
+
nav a.nl:hover{color:var(--ink);text-decoration:none}
|
| 55 |
+
.tg{background:none;border:1px solid var(--line2);color:var(--ink2);border-radius:999px;padding:5px 12px;font-size:12px;font-family:var(--mono);cursor:pointer;transition:.25s cubic-bezier(.16,1,.3,1)}
|
| 56 |
+
.tg:hover{color:var(--ink);border-color:var(--ink3)}
|
| 57 |
+
.tg:focus-visible{outline:2px solid var(--accent);outline-offset:2px}
|
| 58 |
+
@media(max-width:680px){nav a.nl{display:none}}
|
| 59 |
+
|
| 60 |
+
/* hero */
|
| 61 |
+
header{padding:72px 0 40px;border-bottom:1px solid var(--line)}
|
| 62 |
+
.kic{font-family:var(--mono);font-size:12.5px;letter-spacing:.16em;text-transform:uppercase;color:var(--pass);font-weight:500;display:flex;align-items:center;gap:9px;margin-bottom:22px}
|
| 63 |
+
.kic::before{content:"";width:26px;height:1px;background:var(--pass)}
|
| 64 |
+
h1.title{font-size:clamp(2.6rem,6vw,4.4rem);font-weight:800;margin:0 0 4px;line-height:1}
|
| 65 |
+
h1.title .expand{display:block;font-size:clamp(1rem,2.2vw,1.35rem);font-weight:500;color:var(--ink2);font-family:var(--sans);letter-spacing:0;margin-top:18px;max-width:30ch}
|
| 66 |
+
.lede{font-size:clamp(1.12rem,2vw,1.32rem);line-height:1.5;color:var(--ink);max-width:46ch;margin:30px 0 0;font-weight:400}
|
| 67 |
+
.lede b{font-weight:600;color:var(--ink)}
|
| 68 |
+
.herofacts{display:flex;flex-wrap:wrap;gap:0;margin-top:40px;border:1px solid var(--line);border-radius:14px;overflow:hidden;background:var(--panel);box-shadow:var(--shadow)}
|
| 69 |
+
.hf{flex:1;min-width:128px;padding:18px 20px;border-right:1px solid var(--line)}
|
| 70 |
+
.hf:last-child{border-right:none}
|
| 71 |
+
.hf .n{font-family:var(--serif);font-weight:600;font-size:1.9rem;line-height:1;color:var(--ink)}
|
| 72 |
+
.hf .l{font-size:12.5px;color:var(--ink2);margin-top:7px;line-height:1.35}
|
| 73 |
+
@media(max-width:560px){.hf{flex:1 1 40%;border-bottom:1px solid var(--line)}}
|
| 74 |
+
|
| 75 |
+
section{padding:62px 0;border-bottom:1px solid var(--line)}
|
| 76 |
+
.eyebrow{font-family:var(--mono);font-size:12px;letter-spacing:.15em;text-transform:uppercase;color:var(--ink3);font-weight:500;margin:0 0 14px}
|
| 77 |
+
h2{font-size:clamp(1.7rem,3.2vw,2.3rem);margin:0 0 8px}
|
| 78 |
+
.sub{color:var(--ink2);font-size:1.05rem;max-width:62ch;margin:0 0 30px}
|
| 79 |
+
p{margin:0 0 18px}
|
| 80 |
+
.prose p{color:var(--ink);}
|
| 81 |
+
.prose p.muted{color:var(--ink2)}
|
| 82 |
+
|
| 83 |
+
/* the core idea — two cards */
|
| 84 |
+
.split{display:grid;grid-template-columns:1fr 1fr;gap:18px}
|
| 85 |
+
@media(max-width:720px){.split{grid-template-columns:1fr}}
|
| 86 |
+
.idea{background:var(--panel);border:1px solid var(--line);border-radius:14px;padding:26px;box-shadow:var(--shadow)}
|
| 87 |
+
.idea .q{font-family:var(--serif);font-size:1.25rem;color:var(--ink);font-weight:600;margin-bottom:10px}
|
| 88 |
+
.idea.usual{opacity:.92}
|
| 89 |
+
.idea .tag{font-family:var(--mono);font-size:11px;letter-spacing:.1em;text-transform:uppercase;color:var(--ink3);margin-bottom:14px;display:block}
|
| 90 |
+
.idea.sieve{border-color:var(--pass);box-shadow:0 0 0 1px var(--pass),var(--shadow)}
|
| 91 |
+
.idea.sieve .tag{color:var(--pass)}
|
| 92 |
+
|
| 93 |
+
/* families */
|
| 94 |
+
.fam{border:1px solid var(--line);border-radius:13px;overflow:hidden;margin-bottom:14px;background:var(--panel)}
|
| 95 |
+
.fam>summary{list-style:none;cursor:pointer;padding:18px 22px;display:flex;align-items:center;gap:16px}
|
| 96 |
+
.fam>summary::-webkit-details-marker{display:none}
|
| 97 |
+
.fam .letter{font-family:var(--serif);font-weight:800;font-size:1.5rem;width:38px;height:38px;flex:none;display:grid;place-items:center;border-radius:9px;background:var(--bg2);color:var(--pass)}
|
| 98 |
+
.fam .fname{font-family:var(--serif);font-weight:600;font-size:1.18rem}
|
| 99 |
+
.fam .fdesc{color:var(--ink2);font-size:.92rem;margin-top:2px}
|
| 100 |
+
.fam .chev{margin-left:auto;color:var(--ink3);transition:transform .3s cubic-bezier(.16,1,.3,1);font-family:var(--mono)}
|
| 101 |
+
.fam[open] .chev{transform:rotate(90deg)}
|
| 102 |
+
.fam .tasks{padding:2px 22px 18px}
|
| 103 |
+
.trow{display:flex;gap:12px;padding:9px 0;border-top:1px solid var(--line);align-items:baseline}
|
| 104 |
+
.trow .tid{font-family:var(--mono);font-size:12px;color:var(--ink3);width:38px;flex:none}
|
| 105 |
+
.trow .tnm{font-weight:500;color:var(--ink);min-width:200px}
|
| 106 |
+
.trow .ttx{color:var(--ink2);font-size:.92rem}
|
| 107 |
+
.pill{font-family:var(--mono);font-size:9.5px;letter-spacing:.06em;text-transform:uppercase;padding:2px 7px;border-radius:999px;font-weight:600;white-space:nowrap;align-self:center}
|
| 108 |
+
.pill.diag{background:var(--diag-soft);color:var(--diag)}
|
| 109 |
+
.pill.gen{background:var(--bg2);color:var(--ink3)}
|
| 110 |
+
@media(max-width:600px){.trow{flex-wrap:wrap}.trow .tnm{min-width:0}}
|
| 111 |
+
|
| 112 |
+
/* audit rules */
|
| 113 |
+
.rules{display:grid;grid-template-columns:repeat(3,1fr);gap:16px;margin-top:8px}
|
| 114 |
+
@media(max-width:720px){.rules{grid-template-columns:1fr}}
|
| 115 |
+
.rule{background:var(--panel);border:1px solid var(--line);border-radius:13px;padding:22px;box-shadow:var(--shadow)}
|
| 116 |
+
.rule .rn{font-family:var(--mono);font-size:12px;color:var(--diag);font-weight:600;letter-spacing:.08em;margin-bottom:10px}
|
| 117 |
+
.rule h3{font-size:1.08rem;margin:0 0 8px}
|
| 118 |
+
.rule p{font-size:.93rem;color:var(--ink2);margin:0}
|
| 119 |
+
.rule code{font-size:.82em;background:var(--bg2);padding:1px 5px;border-radius:4px;color:var(--ink)}
|
| 120 |
+
|
| 121 |
+
/* leaderboard / heatmap */
|
| 122 |
+
.lbwrap{background:var(--panel);border:1px solid var(--line);border-radius:14px;padding:8px;box-shadow:var(--shadow);overflow-x:auto}
|
| 123 |
+
table.heat{border-collapse:collapse;width:100%;font-size:13.5px;min-width:560px}
|
| 124 |
+
table.heat th,table.heat td{padding:8px 6px;text-align:center}
|
| 125 |
+
table.heat thead th{font-family:var(--mono);font-size:11px;letter-spacing:.04em;color:var(--ink2);font-weight:500;border-bottom:1px solid var(--line2);position:sticky;left:auto}
|
| 126 |
+
table.heat th.enc{text-align:left;font-family:var(--sans);font-weight:600;font-size:13.5px;color:var(--ink);white-space:nowrap;padding-left:14px}
|
| 127 |
+
table.heat td.delta{font-family:var(--mono);font-weight:600;font-size:14px;color:var(--ink);border-left:1px solid var(--line2)}
|
| 128 |
+
table.heat .cell{font-family:var(--mono);font-size:12.5px;color:#0c1410;border-radius:5px;cursor:default;font-weight:500}
|
| 129 |
+
/* Heat-map cell text colour in dark mode.
   An at-rule cannot be an item of a selector list, so the forced-dark case
   and the OS-preference case are written as two separate rules. The :not()
   matches how the dark theme variables are gated, so an explicit
   data-theme="light" still wins over the OS preference. */
:root[data-theme="dark"] table.heat .cell{color:#eafffa}
@media (prefers-color-scheme:dark){:root:not([data-theme="light"]) table.heat .cell{color:#eafffa}}
|
| 130 |
+
table.heat tbody tr{border-bottom:1px solid var(--line)}
|
| 131 |
+
table.heat tbody tr:last-child{border-bottom:none}
|
| 132 |
+
table.heat tbody tr.floor th.enc{color:var(--diag)}
|
| 133 |
+
.ranknum{font-family:var(--mono);color:var(--ink3);font-size:11px;margin-right:8px}
|
| 134 |
+
.lbnote{font-size:12.5px;color:var(--ink2);margin-top:14px;font-family:var(--mono);line-height:1.6}
|
| 135 |
+
.lbnote b{color:var(--ink)}
|
| 136 |
+
.dbar{height:7px;border-radius:4px;background:var(--pass);display:inline-block;vertical-align:middle;margin-left:8px;opacity:.5}
|
| 137 |
+
|
| 138 |
+
/* findings */
|
| 139 |
+
.find{display:grid;grid-template-columns:1fr;gap:0}
|
| 140 |
+
.f{padding:24px 0;border-top:1px solid var(--line);display:grid;grid-template-columns:64px 1fr;gap:20px}
|
| 141 |
+
.f:first-child{border-top:none}
|
| 142 |
+
.f .fn{font-family:var(--serif);font-size:2.1rem;font-weight:800;color:var(--line2);line-height:1}
|
| 143 |
+
.f h3{font-size:1.2rem;margin:0 0 7px}
|
| 144 |
+
.f p{margin:0;color:var(--ink2);font-size:.97rem}
|
| 145 |
+
.f p b{color:var(--ink);font-weight:600}
|
| 146 |
+
.f .num{font-family:var(--mono);color:var(--pass);font-weight:600}
|
| 147 |
+
@media(max-width:560px){.f{grid-template-columns:1fr;gap:6px}.f .fn{font-size:1.4rem}}
|
| 148 |
+
|
| 149 |
+
/* quickstart */
|
| 150 |
+
.code{background:#15171c;border:1px solid #262a31;border-radius:12px;padding:20px 22px;font-family:var(--mono);font-size:13.5px;color:#cfd6dd;overflow-x:auto;line-height:1.85;position:relative}
|
| 151 |
+
.code .c{color:#6b7785}
|
| 152 |
+
.code .g{color:#7fd1bd}
|
| 153 |
+
.code .y{color:#d8b673}
|
| 154 |
+
.copy{position:absolute;top:12px;right:12px;background:#262a31;border:none;color:#9aa3ad;font-family:var(--mono);font-size:11px;padding:5px 10px;border-radius:6px;cursor:pointer;transition:.2s}
|
| 155 |
+
.copy:hover{background:#323843;color:#dfe4ea}
|
| 156 |
+
.steps{counter-reset:s;margin:0 0 26px;padding:0;list-style:none}
|
| 157 |
+
.steps li{counter-increment:s;padding:4px 0 4px 38px;position:relative;color:var(--ink2)}
|
| 158 |
+
.steps li::before{content:counter(s);position:absolute;left:0;top:2px;width:24px;height:24px;border-radius:50%;background:var(--bg2);border:1px solid var(--line2);color:var(--ink);font-family:var(--mono);font-size:12px;display:grid;place-items:center}
|
| 159 |
+
.steps li b{color:var(--ink)}
|
| 160 |
+
|
| 161 |
+
.callout{background:var(--pass-soft);border:1px solid color-mix(in srgb,var(--pass) 35%,transparent);border-radius:12px;padding:18px 22px;font-size:.96rem;color:var(--ink)}
|
| 162 |
+
.callout b{color:var(--pass)}
|
| 163 |
+
|
| 164 |
+
/* footer */
|
| 165 |
+
footer{padding:54px 0 80px;color:var(--ink2);font-size:14px}
|
| 166 |
+
footer .grid{display:flex;flex-wrap:wrap;gap:40px;margin-bottom:34px}
|
| 167 |
+
footer h4{font-family:var(--mono);font-size:11px;letter-spacing:.12em;text-transform:uppercase;color:var(--ink3);margin:0 0 12px;font-weight:500}
|
| 168 |
+
footer a{display:block;color:var(--ink2);margin-bottom:7px}
|
| 169 |
+
footer a:hover{color:var(--accent)}
|
| 170 |
+
.disc{border-top:1px solid var(--line);padding-top:24px;font-size:13px;color:var(--ink3);max-width:74ch;line-height:1.65}
|
| 171 |
+
.btn{display:inline-flex;align-items:center;gap:8px;background:var(--ink);color:var(--bg);padding:11px 20px;border-radius:999px;font-weight:600;font-size:14px;font-family:var(--sans);transition:.25s cubic-bezier(.16,1,.3,1)}
|
| 172 |
+
.btn:hover{text-decoration:none;transform:translateY(-1px);box-shadow:var(--shadow)}
|
| 173 |
+
.btn.alt{background:transparent;color:var(--ink);border:1px solid var(--line2)}
|
| 174 |
+
.btnrow{display:flex;gap:12px;flex-wrap:wrap;margin-top:30px}
|
| 175 |
+
.mesh path,.mesh line{stroke:var(--pass);stroke-width:1.2;fill:none}
|
| 176 |
+
</style>
|
| 177 |
+
</head>
|
| 178 |
+
<body>
|
| 179 |
+
|
| 180 |
+
<nav><div class="wrap">
|
| 181 |
+
<span class="brand">
|
| 182 |
+
<svg class="mesh" viewBox="0 0 20 20" aria-hidden="true"><path d="M2 5h16M2 10h16M2 15h16M5 2v16M10 2v16M15 2v16"/></svg>
|
| 183 |
+
SIEVE
|
| 184 |
+
</span>
|
| 185 |
+
<a class="nl" href="#idea">Idea</a>
|
| 186 |
+
<a class="nl" href="#measures">Tasks</a>
|
| 187 |
+
<a class="nl" href="#audit">Audit</a>
|
| 188 |
+
<a class="nl" href="#board">Leaderboard</a>
|
| 189 |
+
<a class="nl" href="#findings">Findings</a>
|
| 190 |
+
<a class="nl" href="#run">Run it</a>
|
| 191 |
+
<button class="tg" id="tg" aria-label="Toggle colour theme">auto</button>
|
| 192 |
+
</div></nav>
|
| 193 |
+
|
| 194 |
+
<header><div class="wrap">
|
| 195 |
+
<div class="kic">Sentence-embedding Interpretability EValuation</div>
|
| 196 |
+
<h1 class="title">SIEVE
|
| 197 |
+
<span class="expand">A benchmark for what a sentence embedding will, and won't, tell you.</span>
|
| 198 |
+
</h1>
|
| 199 |
+
<p class="lede">Most benchmarks ask whether a sentence embedding is <i>good</i> — does it retrieve, cluster, rank. SIEVE asks whether it is <b>legible</b>: given one frozen 1024-d vector, what can you <b>read</b> out of it, <b>decompose</b>, <b>build</b>, or <b>edit</b> — and what is simply <b>not there</b>. It runs on any encoder, on CPU, and it is built to refuse to fool you.</p>
|
| 200 |
+
<div class="herofacts">
|
| 201 |
+
<div class="hf"><div class="n">26</div><div class="l">tasks across 5 capability families</div></div>
|
| 202 |
+
<div class="hf"><div class="n">9</div><div class="l">encoders profiled (SONAR → GloVe-bag)</div></div>
|
| 203 |
+
<div class="hf"><div class="n">5</div><div class="l">live discriminating axes, audit-gated</div></div>
|
| 204 |
+
<div class="hf"><div class="n">CPU</div><div class="l">runnable on your own HF encoder</div></div>
|
| 205 |
+
</div>
|
| 206 |
+
</div></header>
|
| 207 |
+
|
| 208 |
+
<section id="idea"><div class="wrap">
|
| 209 |
+
<p class="eyebrow">The premise</p>
|
| 210 |
+
<h2>Legible is not the same as good</h2>
|
| 211 |
+
<p class="sub">A high score on a retrieval benchmark tells you a vector is <i>useful</i>. It says nothing about whether a human — or a safety monitor — can tell what is inside it. Those are different questions, and almost nothing measures the second one.</p>
|
| 212 |
+
<div class="split">
|
| 213 |
+
<div class="idea usual">
|
| 214 |
+
<span class="tag">What benchmarks usually ask</span>
|
| 215 |
+
<div class="q">"Is this embedding good?"</div>
|
| 216 |
+
<p class="muted" style="margin:0;color:var(--ink2);font-size:.96rem">Retrieval accuracy, clustering quality, STS correlation. Optimised for downstream utility. A perfectly opaque vector can top every one of them.</p>
|
| 217 |
+
</div>
|
| 218 |
+
<div class="idea sieve">
|
| 219 |
+
<span class="tag">What SIEVE asks</span>
|
| 220 |
+
<div class="q">"Is this embedding legible?"</div>
|
| 221 |
+
<p style="margin:0;color:var(--ink2);font-size:.96rem">Can you read the words, the numbers, the order? Decompose its structure? Build one from parts? Edit one sentence of three and leave the rest? And crucially — <b style="color:var(--pass)">where does that legibility run out?</b></p>
|
| 222 |
+
</div>
|
| 223 |
+
</div>
|
| 224 |
+
<p class="prose" style="margin-top:30px;color:var(--ink2)">The name is the method. A sieve sorts what passes through from what is caught. SIEVE separates what a sentence embedding genuinely carries in a readable form from what only <i>looks</i> readable because a bag-of-words baseline already solved it. The second half is the harder, more honest measurement — and it is where the interesting science lives.</p>
|
| 225 |
+
</div></section>
|
| 226 |
+
|
| 227 |
+
<section id="measures"><div class="wrap">
|
| 228 |
+
<p class="eyebrow">What it measures</p>
|
| 229 |
+
<h2>Five families, twenty-six tasks</h2>
|
| 230 |
+
<p class="sub">Every task is normalised to 0–1 and reported against a <b>baseline</b> (random / bag-of-words / surface-position) and a <b>ceiling</b> (oracle / full-vector). Tasks marked <span class="pill diag" style="display:inline-block">diag</span> are diagnostic — a <i>low</i> score is the finding, not a failure. <span class="pill gen" style="display:inline-block">gen</span> tasks need a decoder.</p>
|
| 231 |
+
<div id="families"></div>
|
| 232 |
+
</div></section>
|
| 233 |
+
|
| 234 |
+
<section id="audit"><div class="wrap">
|
| 235 |
+
<p class="eyebrow">Why you can trust it</p>
|
| 236 |
+
<h2>The audit: a benchmark that refuses to fool itself</h2>
|
| 237 |
+
<p class="sub">The hardest failure in interpretability is a high number that means nothing — a probe that "reads" a concept that a bag-of-words could read just as well. SIEVE bakes a confound check into the harness. Every task is <b>audited</b>, and the headline averages <b>only the tasks that pass</b>.</p>
|
| 238 |
+
<div class="rules">
|
| 239 |
+
<div class="rule">
|
| 240 |
+
<div class="rn">RULE 1 · SPECIFICITY</div>
|
| 241 |
+
<h3>Not z-specific → excluded</h3>
|
| 242 |
+
<p>If a <code>bag-of-words</code> or <code>surface-position</code> baseline already reaches the score, the task isn't measuring the <i>embedding</i> — a dumb null solved it. It cannot count toward "this vector is legible."</p>
|
| 243 |
+
</div>
|
| 244 |
+
<div class="rule">
|
| 245 |
+
<div class="rn">RULE 2 · HEADROOM</div>
|
| 246 |
+
<h3>Degenerate → excluded</h3>
|
| 247 |
+
<p>If the score barely clears its own baseline (<code>Δ&lt;0.05</code>), the task is saturated or trivial. A pass with no headroom carries no ranking signal and is dropped from the comparison.</p>
|
| 248 |
+
</div>
|
| 249 |
+
<div class="rule">
|
| 250 |
+
<div class="rn">RULE 3 · AGREEMENT</div>
|
| 251 |
+
<h3>Disagreeing arms → excluded</h3>
|
| 252 |
+
<p>Tasks measured two ways (a probe arm and a decode arm) must agree to <code>±0.15</code>. A gap means one arm is buggy — so the number is suppressed until it's reconciled.</p>
|
| 253 |
+
</div>
|
| 254 |
+
</div>
|
| 255 |
+
<p class="prose" style="margin-top:28px;color:var(--ink2)">This is not theoretical. During development the audit caught the benchmark's own most-quotable headline — a "universal no-binding" result that was really a <i>broken task</i> scoring chance for everyone — and forced it to be rebuilt on natural data before it could ship. The two-track leaderboard below reports an apples-to-apples <b>intersection</b> of audit-passing tasks, never one inflated winner column.</p>
|
| 256 |
+
</div></section>
|
| 257 |
+
|
| 258 |
+
<section id="board"><div class="wrap">
|
| 259 |
+
<p class="eyebrow">The leaderboard</p>
|
| 260 |
+
<h2>Nine encoders, five live axes</h2>
|
| 261 |
+
<p class="sub">The fair comparative number — <b>Encode-Readout-Δ</b> — is the mean of <code>(score − baseline) / (ceiling − baseline)</code> over the five tasks that <i>every</i> encoder runs and audit-passes on the same probe-only arm. Hover a cell for the per-axis score. The static <b style="color:var(--diag)">GloVe-bag</b> is a literal bag-of-words — the floor, and a control that the additivity axis correctly flags.</p>
|
| 262 |
+
<div class="lbwrap">
|
| 263 |
+
<table class="heat" id="board-table"></table>
|
| 264 |
+
</div>
|
| 265 |
+
<p class="lbnote" id="lbnote"></p>
|
| 266 |
+
</div></section>
|
| 267 |
+
|
| 268 |
+
<section id="findings"><div class="wrap">
|
| 269 |
+
<p class="eyebrow">What it found</p>
|
| 270 |
+
<h2>Five honest findings</h2>
|
| 271 |
+
<p class="sub">The benchmark's first scientific use — a designed sweep of size, training objective, and architecture — produced results that were pre-registered before looking, and one claim it had to walk back through its own controls.</p>
|
| 272 |
+
<div class="find">
|
| 273 |
+
<div class="f"><div class="fn">01</div><div>
|
| 274 |
+
<h3>Interpretability is not one number</h3>
|
| 275 |
+
<p>Across the nine encoders the discriminating axes rank them <b>almost independently</b> — mean cross-axis rank-correlation <span class="num">+0.21</span>. Lexical-readout and feature-monosemanticity are even <b>anti-correlated</b> (<span class="num">−0.67</span>): the encoders best at reading words out are the <i>worst</i> at having monosemantic features. There is no single "interpretability score" — which is why SIEVE reports a profile, not a winner.</p>
|
| 276 |
+
</div></div>
|
| 277 |
+
<div class="f"><div class="fn">02</div><div>
|
| 278 |
+
<h3>No abstract role-binding — in any encoder</h3>
|
| 279 |
+
<p>Trained to read "who is the agent" on one sentence construction and tested on another, <b>every</b> encoder fails (cross-construction AUC <span class="num">0.47–0.60</span>, none above 0.70) while a surface-position baseline reads it perfectly (<span class="num">0.91</span>). Mean-pooled sentence embeddings carry <i>who</i> and <i>what</i> as content, but not <i>who-did-what-to-whom</i> as structure. Verified on 1,270 naturally-parsed sentences across five syntactic forms.</p>
|
| 280 |
+
</div></div>
|
| 281 |
+
<div class="f"><div class="fn">03</div><div>
|
| 282 |
+
<h3>Training objective reshapes <i>additivity</i>, not readout</h3>
|
| 283 |
+
<p>A masked-LM BERT and a contrastive retriever reach the <b>same</b> abstract-readout score — but contrastive training makes the pooled space far more of a literal bag-of-words. Objective controls how additive the geometry is, not how much abstract content is legible.</p>
|
| 284 |
+
</div></div>
|
| 285 |
+
<div class="f"><div class="fn">04</div><div>
|
| 286 |
+
<h3>Interpretability does not scale with size</h3>
|
| 287 |
+
<p>Across e5-small → base → large the readout score is <b>flat within seed noise</b> (<span class="num">σ≈0.016</span>, span 0.03, non-monotone). Bigger is not more legible — at least not within a family.</p>
|
| 288 |
+
</div></div>
|
| 289 |
+
<div class="f"><div class="fn">05</div><div>
|
| 290 |
+
<h3>The control behaves exactly as it should</h3>
|
| 291 |
+
<p>A literal static bag-of-GloVe-vectors is the <i>only</i> encoder flagged genuinely additive — order-sensitivity exactly <span class="num">0.000</span>, vector cosine to the word-mean <span class="num">0.985</span> — and sits at the readout floor. When your benchmark's planted negative control lands precisely where it must, the axis is calibrated. <span style="color:var(--ink3)">(And the over-claim it caught: an early "contrastive embeddings are just bags" headline was walked back once a shuffled-word + order-permutation control showed it was a length artifact.)</span></p>
|
| 292 |
+
</div></div>
|
| 293 |
+
</div>
|
| 294 |
+
</div></section>
|
| 295 |
+
|
| 296 |
+
<section id="run"><div class="wrap">
|
| 297 |
+
<p class="eyebrow">Run it</p>
|
| 298 |
+
<h2>Score your own encoder in three lines</h2>
|
| 299 |
+
<ol class="steps">
|
| 300 |
+
<li>Download <code>sieve_bench/</code> from the Hugging Face repo — no SONAR, no GPU required.</li>
|
| 301 |
+
<li><b>Point it at any model</b> via the <code>hf:&lt;id&gt;:&lt;pool&gt;</code> adapter, or implement the three-method <code>Encoder</code> interface for your own.</li>
|
| 302 |
+
<li>Read the auto-generated two-track leaderboard with its audit column.</li>
|
| 303 |
+
</ol>
|
| 304 |
+
<div class="code">
|
| 305 |
+
<button class="copy" onclick="cp(this)">copy</button>
|
| 306 |
+
<span class="c"># a stranger with any HuggingFace sentence encoder, on CPU</span>
|
| 307 |
+
pip install -r requirements.txt
|
| 308 |
+
python run.py --encoder <span class="y">hf:sentence-transformers/all-MiniLM-L6-v2:mean</span> --tasks all
|
| 309 |
+
python make_leaderboard.py <span class="c"># → LEADERBOARD.md</span>
|
| 310 |
+
</div>
|
| 311 |
+
<div class="callout" style="margin-top:24px"><b>Decode-free by default.</b> The core readout, decomposition, and cross-lingual families run on any encoder that produces a vector. Construction and editing tasks need a decoder and skip cleanly when one isn't present — so the comparison stays fair across encoders that can't generate.</div>
|
| 312 |
+
</div></section>
|
| 313 |
+
|
| 314 |
+
<footer><div class="wrap">
|
| 315 |
+
<div class="grid">
|
| 316 |
+
<div>
|
| 317 |
+
<h4>Artifact</h4>
|
| 318 |
+
<a href="https://huggingface.co/nickypro/sonar-sae/tree/main/sieve_bench">Hugging Face · sieve_bench</a>
|
| 319 |
+
<a href="https://huggingface.co/nickypro/sonar-sae/blob/main/sieve_bench/README.md">README & quickstart</a>
|
| 320 |
+
<a href="https://huggingface.co/nickypro/sonar-sae/blob/main/sieve_bench/SIEVE_SPEC.md">Full design spec</a>
|
| 321 |
+
<a href="https://huggingface.co/nickypro/sonar-sae/blob/main/sieve_bench/LEADERBOARD.md">Raw leaderboard</a>
|
| 322 |
+
</div>
|
| 323 |
+
<div>
|
| 324 |
+
<h4>Design invariants</h4>
|
| 325 |
+
<a style="cursor:default">Baseline + ceiling + control, every task</a>
|
| 326 |
+
<a style="cursor:default">Audit-pass tasks only in the headline</a>
|
| 327 |
+
<a style="cursor:default">Diagnostic tasks: low is the finding</a>
|
| 328 |
+
<a style="cursor:default">Per-axis profile, never one winner</a>
|
| 329 |
+
</div>
|
| 330 |
+
<div>
|
| 331 |
+
<h4>The five families</h4>
|
| 332 |
+
<a style="cursor:default">A · Readout</a>
|
| 333 |
+
<a style="cursor:default">B · Decomposition</a>
|
| 334 |
+
<a style="cursor:default">C · Construction</a>
|
| 335 |
+
<a style="cursor:default">D · Editing</a>
|
| 336 |
+
<a style="cursor:default">E · Cross-lingual</a>
|
| 337 |
+
</div>
|
| 338 |
+
</div>
|
| 339 |
+
<div class="btnrow" style="margin-bottom:34px">
|
| 340 |
+
<a class="btn" href="https://huggingface.co/nickypro/sonar-sae/tree/main/sieve_bench">Get SIEVE →</a>
|
| 341 |
+
<a class="btn alt" href="#measures">See the tasks</a>
|
| 342 |
+
</div>
|
| 343 |
+
<p class="disc">This page and the SIEVE benchmark were generated by an AI research agent. Findings were produced by running code and reading the results, with pre-registered predictions and an automated confound audit; they have primarily been machine-verified and self-critiqued, not independently reproduced by a human. Numbers are read from the actual benchmark runs. "Legibility" here means probe / decode / reconstruction readout of a frozen embedding — not a claim about human perception. Reviewed adversarially by a second model during development; the benchmark walked back one over-claim and fixed one broken headline task through its own controls. Treat the comparative leaderboard as a research instrument, not a product ranking: several gaps sit within seed noise, as noted on the board.</p>
|
| 344 |
+
</div></footer>
|
| 345 |
+
|
| 346 |
+
<div style="position:fixed;top:10px;right:10px;background:rgba(0,0,0,0.45);color:rgba(255,255,255,0.7);font-size:9px;padding:2px 7px;border-radius:8px;z-index:9999;font-family:system-ui,-apple-system,sans-serif;pointer-events:none;backdrop-filter:blur(6px);-webkit-backdrop-filter:blur(6px);letter-spacing:0.3px;text-transform:uppercase;font-weight:500;">ai gen</div>
|
| 347 |
+
|
| 348 |
+
<script>
|
| 349 |
+
/* ---- theme toggle ---- */
|
| 350 |
+
const modes=['auto','light','dark'];
// Index into `modes` of the active theme. Reading localStorage throws a
// SecurityError when the browser blocks storage for this page; treat that the
// same as "nothing saved" so the rest of this script still runs.
let mi=0;
try{mi=modes.indexOf(localStorage.getItem('theme')||'auto');}catch(e){mi=0;}
if(mi<0)mi=0; // unrecognised saved value → fall back to 'auto'
const tgEl=document.getElementById('tg');
/**
 * Apply modes[mi] to the page.
 * 'auto' removes the data-theme attribute from <html>; 'light' / 'dark' set it.
 * The toggle element (when present) is relabelled with the active mode name.
 */
function applyTheme(){
  const m=modes[mi];
  if(m==='auto')document.documentElement.removeAttribute('data-theme');
  else document.documentElement.setAttribute('data-theme',m);
  if(tgEl)tgEl.textContent=m;
}
// Each click advances to the next mode, wrapping around, and persists it.
if(tgEl)tgEl.addEventListener('click',()=>{
  mi=(mi+1)%modes.length;
  try{localStorage.setItem('theme',modes[mi]);}catch(e){/* storage unavailable: the choice lasts for this page view only */}
  applyTheme();
});
applyTheme();
|
| 356 |
+
|
| 357 |
+
/* ---- copy ---- */
|
| 358 |
+
/**
 * Copy the text shown beside a copy button to the clipboard.
 *
 * @param {HTMLElement} b  The clicked button. The text to copy is taken from
 *   b.parentElement.innerText, which is expected to begin with the button's
 *   own label.
 *
 * The button reads 'copied' after a successful write and 'failed' when the
 * Clipboard API is unavailable or the write is rejected; it returns to 'copy'
 * after 1.4 s in both cases.
 */
function cp(b){
  // Strip whatever the button currently displays, not just the literal
  // "copy": during the 1.4 s feedback window the label is different, and a
  // second click would otherwise copy the label together with the code.
  const label=b.innerText||b.textContent||'';
  const raw=b.parentElement.innerText;
  const code=(label&&raw.startsWith(label)?raw.slice(label.length):raw.replace(/^copy/,'')).trim();
  const done=ok=>{
    b.textContent=ok?'copied':'failed';
    setTimeout(()=>{b.textContent='copy';},1400);
  };
  // navigator.clipboard is undefined outside secure contexts.
  if(!navigator.clipboard||!navigator.clipboard.writeText){done(false);return;}
  // writeText is asynchronous; only report success once it has resolved.
  navigator.clipboard.writeText(code).then(()=>done(true),()=>done(false));
}
|
| 359 |
+
|
| 360 |
+
/* ---- families data ---- */
|
| 361 |
+
const FAM=[
|
| 362 |
+
{L:'A',n:'Readout',d:'read content out of a frozen vector',t:[
|
| 363 |
+
['t01','lexical bag','which content words are present',''],
|
| 364 |
+
['t02','number-exact','recover the exact numeric value',''],
|
| 365 |
+
['t03','entity-presence','is entity X present, tested across paraphrase',''],
|
| 366 |
+
['t04','negation-scope','negation present, and which clause it scopes',''],
|
| 367 |
+
['t05','position / order','recover word order from the vector',''],
|
| 368 |
+
['t06','thematic-role','is X the agent? — across constructions','diag'],
|
| 369 |
+
['t07','meaning-coverage','rebuild meaning from readable properties','gen'],
|
| 370 |
+
['t08','length-generalization','does a short-trained readout survive on long inputs',''],
|
| 371 |
+
['t09','coreference','does a pronoun refer to entity X','diag']]},
|
| 372 |
+
{L:'B',n:'Decomposition',d:'how the vector is organised',t:[
|
| 373 |
+
['t10','dimensionality','effective rank, intrinsic dimension of the manifold','diag'],
|
| 374 |
+
['t11','position un-rotation','can a token be separated from its position','diag'],
|
| 375 |
+
['t12','additivity','is the vector just a bag of word-vectors','diag'],
|
| 376 |
+
['t13','SAE-monosemanticity','do sparse features carry single clean concepts',''],
|
| 377 |
+
['t14','capacity-law','how recoverability decays with sentence length','']]},
|
| 378 |
+
{L:'C',n:'Construction',d:'build a vector from parts',t:[
|
| 379 |
+
['t15','sentence-from-words','construct a faithful vector from shuffled words','gen'],
|
| 380 |
+
['t16','vocab-coverage','single-word round-trip across a vocabulary','gen'],
|
| 381 |
+
['t17','recombination','is uniform pooling already near-optimal','diag']]},
|
| 382 |
+
{L:'D',n:'Editing',d:'change a vector precisely',t:[
|
| 383 |
+
['t18','concept-steer','add a concept direction — does it causally appear','diag'],
|
| 384 |
+
['t22','word-edit','replace X with Y, preserving the rest','gen'],
|
| 385 |
+
['t23','edit sentence 2 of 3','edit one clause, leave the others intact','gen'],
|
| 386 |
+
['t24','sentence-reorder','swap the order of encoded sentences','diag'],
|
| 387 |
+
['t25','concept-injection','inject content at the predicted capacity budget','gen'],
|
| 388 |
+
['t26','causal-identifiability','swap agent⇄patient while preserving content','diag']]},
|
| 389 |
+
{L:'E',n:'Cross-lingual',d:'across languages and encoders',t:[
|
| 390 |
+
['t19','cross-lingual readout','does a content probe transfer across languages',''],
|
| 391 |
+
['t20','decode-by-language','readout quality per language','gen'],
|
| 392 |
+
['t21','encoder generality','does the profile hold across pooling / encoders','']]}
|
| 393 |
+
];
|
| 394 |
+
// Render one collapsible <details> card per task family into #families.
// Family 'A' starts expanded; every task row shows id, name, description and
// an optional 'diag' / 'gen' pill.
const famHost=document.getElementById('families');
for(const fam of FAM){
  let rows='';
  for(const [id,name,text,kind] of fam.t){
    // Only the two known kinds get a pill; anything else renders nothing.
    const pill=(kind==='diag'||kind==='gen')?`<span class="pill ${kind}">${kind}</span>`:'';
    rows+=`<div class="trow"><span class="tid">${id}</span><span class="tnm">${name}</span><span class="ttx">${text}</span>${pill}</div>`;
  }
  const openAttr=fam.L==='A'?' open':'';
  const card=`<details class="fam"${openAttr}><summary><span class="letter">${fam.L}</span><span><span class="fname">${fam.n}</span><div class="fdesc">${fam.d} · ${fam.t.length} tasks</div></span><span class="chev">›</span></summary><div class="tasks">${rows}</div></details>`;
  famHost.insertAdjacentHTML('beforeend',card);
}
|
| 399 |
+
|
| 400 |
+
/* ---- leaderboard heatmap ---- */
|
| 401 |
+
const AX=['t01','t05','t08','t13','t19'];
|
| 402 |
+
const AXL={t01:'lexical',t05:'order',t08:'length-gen',t13:'SAE-mono',t19:'cross-ling'};
|
| 403 |
+
const ROWS=[
|
| 404 |
+
['SONAR',0.762,{t01:0.41,t05:0.84,t08:0.73,t13:0.83,t19:1.00},false],
|
| 405 |
+
['gte-large',0.719,{t01:0.31,t05:0.63,t08:0.70,t13:0.96,t19:1.00},false],
|
| 406 |
+
['mpnet',0.704,{t01:0.26,t05:0.66,t08:0.71,t13:0.96,t19:0.93},false],
|
| 407 |
+
['e5-base',0.690,{t01:0.38,t05:0.70,t08:0.56,t13:0.85,t19:0.96},false],
|
| 408 |
+
['LaBSE',0.671,{t01:0.50,t05:0.67,t08:0.53,t13:0.66,t19:1.00},false],
|
| 409 |
+
['e5-large',0.644,{t01:0.40,t05:0.53,t08:0.49,t13:0.80,t19:0.99},false],
|
| 410 |
+
['e5-small',0.644,{t01:0.40,t05:0.52,t08:0.51,t13:0.80,t19:0.98},false],
|
| 411 |
+
['bert-base',0.565,{t01:0.36,t05:0.80,t08:0.14,t13:0.91,t19:0.61},false],
|
| 412 |
+
['GloVe-bag',0.346,{t01:0.33,t05:0.09,t08:0.06,t13:0.74,t19:0.51},true]
|
| 413 |
+
];
|
| 414 |
+
/** Linear interpolation: returns a at t=0, b at t=1, and extrapolates outside that range. */
function lerp(a,b,t){
  const span=b-a;
  return a+span*t;
}
|
| 415 |
+
/**
 * Background colour for a heat-map cell.
 * v is clamped to [0,1] and mapped to the alpha channel of a fixed teal, so
 * low scores look pale and high scores saturated. A different teal and alpha
 * range are used when the page is in dark mode.
 */
function cellColor(v){
  const level=Math.min(1,Math.max(0,v));
  // An explicit data-theme on <html> wins; with none set, follow the OS preference.
  const forced=document.documentElement.getAttribute('data-theme');
  const isDark=forced==='dark'||(forced!=='light'&&matchMedia('(prefers-color-scheme:dark)').matches);
  return isDark
    ?`rgba(82,196,172,${0.08+0.62*level})`
    :`rgba(29,122,107,${0.06+0.66*level})`;
}
|
| 421 |
+
/**
 * Render the leaderboard heat-map into #board-table.
 *
 * One row per ROWS entry ([name, delta, cells, floor]), one coloured cell per
 * axis in AX (labelled via AXL), plus a Δ-readout column with a proportional
 * bar. Cell colours come from cellColor(), so calling this again after a
 * theme change recolours the table. Rows flagged `floor` get the 'floor' class.
 */
function buildBoard(){
  // Scale the Δ bars against the largest Δ on the board rather than trusting
  // that ROWS[0] is the leader; `||1` keeps an all-zero board from dividing by zero.
  const maxD=Math.max(...ROWS.map(r=>r[1]))||1;
  // scope="col" / scope="row" tie each data cell to its axis and encoder
  // headers for assistive technology.
  let h=`<thead><tr><th class="enc" scope="col">Encoder</th>`+AX.map(a=>`<th scope="col">${AXL[a]}</th>`).join('')+`<th scope="col">Δ readout</th></tr></thead><tbody>`;
  ROWS.forEach((r,i)=>{
    const [name,delta,cells,floor]=r;
    // Rank is the row's position in ROWS, zero-padded to two digits.
    h+=`<tr class="${floor?'floor':''}"><th class="enc" scope="row"><span class="ranknum">${String(i+1).padStart(2,'0')}</span>${name}</th>`;
    AX.forEach(a=>{
      const v=cells[a];
      h+=`<td><span class="cell" title="${name} · ${AXL[a]} = ${v.toFixed(2)}" style="display:inline-block;min-width:42px;padding:4px 0;background:${cellColor(v)}">${v.toFixed(2)}</span></td>`;
    });
    // Bar width is proportional to delta; the top score gets 28px.
    h+=`<td class="delta">${delta.toFixed(3)}<span class="dbar" style="width:${28*delta/maxD}px"></span></td></tr>`;
  });
  h+=`</tbody>`;
  document.getElementById('board-table').innerHTML=h;
}
|
| 433 |
+
buildBoard();
// The runner-up gap is computed from ROWS so the note cannot disagree with the
// board it describes (the previously hard-coded 0.046 did not match
// 0.762 − 0.719 = 0.043).
document.getElementById('lbnote').innerHTML='<b>How to read it.</b> Δ-readout leads with SONAR (0.762) but the gap to the field is small — second place is '+(ROWS[0][1]-ROWS[1][1]).toFixed(3)+' behind, <b>inside the spread and within seed noise</b>. The interesting signal is the columns disagreeing: no encoder wins everywhere. mpnet & gte top SAE-monosemanticity yet sit near the bottom on lexical readout; LaBSE leads lexical; SONAR leads order. That disagreement <b>is</b> finding 01.';
/* recolour on theme change */
// Rebuild when the page's data-theme attribute changes…
new MutationObserver(buildBoard).observe(document.documentElement,{attributes:true,attributeFilter:['data-theme']});
// …and when the OS-level colour-scheme preference flips.
matchMedia('(prefers-color-scheme:dark)').addEventListener('change',buildBoard);
|
| 438 |
+
</script>
|
| 439 |
+
</body>
|
| 440 |
+
</html>
|