Spaces:
Update index.html
Browse files

index.html CHANGED (+40 -22)
@@ -30,6 +30,10 @@
 .chip{border:1px solid var(--chip-border);background:var(--chip);padding:.35rem .5rem;border-radius:10px;font-family:var(--mono);font-size:.9rem;transition:background .12s,border-color .12s}
 .chip:hover{background:var(--chip-hover);border-color:var(--accent)}
 .chip.active{outline:2px solid var(--accent)}
+.chip.special {
+  border-color: #38bdf8;
+  background: #0b2235;
+}
 pre.ids{font-family:var(--mono);background:#0a1220;border:1px solid var(--border);border-radius:10px;padding:.75rem;max-height:360px;overflow:auto;white-space:pre-wrap}
 .caption{color:var(--muted);font-size:.9rem;margin-top:.5rem}
 footer{color:var(--muted);text-align:center;padding:1.25rem 0 2rem}
@@ -148,36 +152,49 @@
 
       status('Tokenizing…');
       try {
-        const enc = await tokenizer.encode(text); //
-
-
-        let ids = Array.isArray(enc)
+        const enc = await tokenizer.encode(text); // include specials (default)
+        // Handle both array/object return shapes
+        const ids = Array.isArray(enc)
           ? enc
           : (enc && (enc.ids ?? enc.input_ids ?? enc.inputIds)) || [];
 
-        //
-        const
-        const
+        // Map special IDs -> special token strings (if available)
+        const specialIds = Array.from(tokenizer.all_special_ids || []);
+        const specialTokens = Array.from(tokenizer.all_special_tokens || []);
+        const idToSpecial = new Map(specialIds.map((id, i) => [id, specialTokens[i]]));
 
-        //
+        // Build token strings for every ID (specials included)
         let tokens = [];
         if (typeof tokenizer.convert_ids_to_tokens === 'function') {
-          tokens = tokenizer.convert_ids_to_tokens(
+          tokens = tokenizer.convert_ids_to_tokens(ids);
         } else if (typeof tokenizer.id_to_token === 'function') {
-          tokens =
-        } else if (!Array.isArray(enc)) {
-
-          const encTokens = Array.isArray(enc.tokens) ? enc.tokens : [];
-          tokens = encTokens.filter((_, i) => !specials.has(ids[i]));
+          tokens = ids.map(id => tokenizer.id_to_token(id));
+        } else if (!Array.isArray(enc) && Array.isArray(enc.tokens)) {
+          tokens = enc.tokens;
         } else {
-          //
-          tokens =
+          // Fallback: decode each ID as a single-piece token
+          tokens = ids.map(id =>
+            tokenizer.decode([id], {
+              // we WANT specials in the stream; decode may return "" for them
+              skip_special_tokens: false,
+              clean_up_tokenization_spaces: false,
+            })
+          );
         }
 
+        // Ensure specials are visible: if a special token decodes to empty,
+        // replace it with its canonical name or a generic tag.
+        tokens = tokens.map((tok, i) => {
+          const id = ids[i];
+          if (tok && tok.length) return tok;
+          if (idToSpecial.has(id)) return idToSpecial.get(id); // e.g., <|endoftext|> for GPT-2
+          return `<special:${id}>`;
+        });
+
         if (myRun !== runId) return;
 
         state.tokens = tokens;
-        state.ids =
+        state.ids = ids; // include specials in the count
         render();
         status(`Done. ${state.tokens.length} tokens.`);
       } catch (e) {
@@ -190,25 +207,26 @@
     function render(){
       const tokens = Array.isArray(state.tokens) ? state.tokens : [];
       const ids = Array.isArray(state.ids) ? state.ids : [];
-
-
+
+      const specialSet = new Set(tokenizer.all_special_ids || []);
+
      tokensEl.innerHTML = '';
       tokens.forEach((tok, i) => {
         const chip = document.createElement('span');
         chip.className = 'chip';
+        if (specialSet.has(ids[i])) chip.classList.add('special'); // <-- highlight specials
         chip.dataset.i = i;
         chip.textContent = tok;
         chip.addEventListener('mouseenter', ()=>highlight(i, true));
         chip.addEventListener('mouseleave', ()=>highlight(i, false));
         tokensEl.appendChild(chip);
       });
-
-      // IDs pane
+
       idsEl.textContent = ids.join(' ');
-
       if (tokens.length === 0) status('Type to tokenize…');
     }
 
+
     function highlight(i, on){
       const ids = Array.isArray(state.ids) ? state.ids : [];
       if (!ids.length) return;
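Note (not part of the patch): the branching in the tokenize hunk is defensive because different tokenizer builds expose different APIs. In transformers.js, encode() returns a plain number[]; tokenizers-style bindings instead return an Encoding object carrying .ids and .tokens, which is why the patch accepts both shapes. A minimal standalone sketch of the same handling, assuming the Space loads its tokenizer via @huggingface/transformers (the model id here is illustrative):

// Sketch only, under the assumptions above; not code from this commit.
import { AutoTokenizer } from '@huggingface/transformers';

const tokenizer = await AutoTokenizer.from_pretrained('Xenova/gpt2');

// transformers.js: encode() returns number[]; object-shaped results
// from other bindings are unwrapped the same way the patch does.
const enc = await tokenizer.encode('Hello world');
const ids = Array.isArray(enc)
  ? enc
  : (enc && (enc.ids ?? enc.input_ids ?? enc.inputIds)) || [];

// Per-ID decode with specials kept, mirroring the patch's fallback branch.
const tokens = ids.map(id =>
  tokenizer.decode([id], {
    skip_special_tokens: false,
    clean_up_tokenization_spaces: false,
  })
);
console.log(ids, tokens);

One caveat worth checking: the idToSpecial Map assumes all_special_ids[i] pairs with all_special_tokens[i]. That holds in Python transformers, where the id list is derived from the token list, but it is an assumption about the JS tokenizer actually in use.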