Update index.html
index.html (+21 −21)
@@ -148,36 +148,36 @@
 
     status('Tokenizing…');
     try {
-
-
-
-
-
-
-
-
-      // 2) Drop special tokens (e.g., BOS/EOS) for the demo
+      const enc = await tokenizer.encode(text); // returns EITHER an array OR an object, depending on tokenizer
+
+      // >>> handle both shapes
+      let ids = Array.isArray(enc)
+        ? enc
+        : (enc && (enc.ids ?? enc.input_ids ?? enc.inputIds)) || [];
+
+      // Drop special tokens (for GPT-2, usually [50256])
       const specials = new Set(tokenizer.all_special_ids || []);
       const idsNoSpecials = ids.filter(id => !specials.has(id));
-
-      //
+
+      // Derive token strings from IDs
       let tokens = [];
       if (typeof tokenizer.convert_ids_to_tokens === 'function') {
         tokens = tokenizer.convert_ids_to_tokens(idsNoSpecials);
       } else if (typeof tokenizer.id_to_token === 'function') {
         tokens = idsNoSpecials.map(id => tokenizer.id_to_token(id));
-      } else if (Array.isArray(enc
-      //
-
+      } else if (!Array.isArray(enc)) {
+        // Some builds expose enc.tokens when enc is an object
+        const encTokens = Array.isArray(enc.tokens) ? enc.tokens : [];
+        tokens = encTokens.filter((_, i) => !specials.has(ids[i]));
       } else {
+        // Last resort: stringify IDs (shouldn’t be needed with GPT-2)
         tokens = idsNoSpecials.map(String);
       }
-
-      if (myRun !== runId) return;
-
-      state.tokens =
-      state.ids
-
+
+      if (myRun !== runId) return;
+
+      state.tokens = tokens;
+      state.ids = idsNoSpecials;
       render();
       status(`Done. ${state.tokens.length} tokens.`);
     } catch (e) {
@@ -185,7 +185,7 @@
       render();
       status('Error tokenizing. See console.');
     }
-
+
 
 function render(){
   const tokens = Array.isArray(state.tokens) ? state.tokens : [];
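Taken together, the patch makes the tokenize step tolerant of both return shapes of tokenizer.encode() (a bare ID array, or an object carrying the IDs under ids / input_ids / inputIds), filters special tokens, and adds fallbacks for turning IDs back into token strings. For reference, a minimal standalone sketch of the same logic, assuming Transformers.js (@xenova/transformers) with a GPT-2 tokenizer; the model id 'Xenova/gpt2' and the helper name normalizeEncoding are illustrative, not taken from this Space:

// Sketch only: mirrors the patch's shape handling outside the Space.
// Assumes Transformers.js; 'Xenova/gpt2' and normalizeEncoding are illustrative.
import { AutoTokenizer } from '@xenova/transformers';

function normalizeEncoding(tokenizer, enc) {
  // IDs: encode() may return a plain array or an object wrapping the IDs.
  const ids = Array.isArray(enc)
    ? enc
    : (enc && (enc.ids ?? enc.input_ids ?? enc.inputIds)) || [];

  // Drop special tokens (for GPT-2 that is <|endoftext|>, id 50256).
  const specials = new Set(tokenizer.all_special_ids || []);
  const idsNoSpecials = ids.filter(id => !specials.has(id));

  // Token strings, using the same fallback chain as the patch.
  let tokens;
  if (typeof tokenizer.convert_ids_to_tokens === 'function') {
    tokens = tokenizer.convert_ids_to_tokens(idsNoSpecials);
  } else if (typeof tokenizer.id_to_token === 'function') {
    tokens = idsNoSpecials.map(id => tokenizer.id_to_token(id));
  } else if (!Array.isArray(enc) && Array.isArray(enc.tokens)) {
    tokens = enc.tokens.filter((_, i) => !specials.has(ids[i]));
  } else {
    tokens = idsNoSpecials.map(String); // last resort
  }
  return { ids: idsNoSpecials, tokens };
}

const tokenizer = await AutoTokenizer.from_pretrained('Xenova/gpt2');
const enc = await tokenizer.encode('Hello world');
console.log(normalizeEncoding(tokenizer, enc));
// e.g. { ids: [15496, 995], tokens: ['Hello', 'Ġworld'] }

The fallback chain is ordered the same way as in the diff: named tokenizer methods first, then the encoding object's own tokens array, then stringified IDs as a last resort.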