// needle-playground / src / tokenizer.ts
// (uploaded via huggingface_hub by shreyask, rev 814c07e verified)
/**
* Needle SentencePiece tokenizer wrapper.
*
* Wraps the sentencepiece-js WASM library to expose a clean `Tokenizer` class
* whose `encode` / `decode` methods are byte-for-byte identical to Cactus's
* Python SentencePiece processor.
*
* Construction:
* - In Node / vitest: pass `fs.readFileSync(modelPath)` as `modelBytes`.
* - In browser: `fetch(modelUrl).then(r => r.arrayBuffer()).then(b => new Uint8Array(b))`.
*
* The model is a BPE SentencePiece protobuf with:
* - vocab_size = 8192
* - byte_fallback = true (unknown bytes β†’ <0xNN> pieces)
* - normalization_rule_name = "identity" (no NFC/NFKC)
* - user_defined_symbols = ["<tool_call>", "<tools>"] at IDs 4 and 5
*/
import { SentencePieceProcessor } from 'sentencepiece-js';
// Unique virtual path used to hand bytes through the fs-shim to sentencepiece-js.
const SP_VIRTUAL_PATH = '__needle_tokenizer__.model';
/**
* Determine at runtime whether we are running in Node.js (including vitest).
* We use (globalThis as any) to avoid TypeScript needing node type definitions
* in the browser-targeted tsconfig.
*/
/**
 * Determine at runtime whether we are running in Node.js (including vitest).
 *
 * We use (globalThis as any) to avoid TypeScript needing node type definitions
 * in the browser-targeted tsconfig.
 *
 * @returns `true` when a Node-shaped `process.versions.node` string exists.
 */
function isNodeEnv(): boolean {
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  const g = globalThis as any;
  const proc = g.process;
  return (
    typeof proc === 'object' &&
    proc !== null &&
    typeof proc.versions === 'object' &&
    // Guard against `versions === null`: `typeof null === 'object'`, so
    // without this check the `.node` access below would throw a TypeError
    // instead of returning false.
    proc.versions !== null &&
    typeof proc.versions.node === 'string'
  );
}
export class Tokenizer {
  private spp: SentencePieceProcessor;
  private ready: Promise<void>;

  /**
   * Start loading the SentencePiece model. The load is asynchronous; await
   * {@link init} (or use {@link createTokenizer}) before encoding/decoding.
   *
   * @param modelBytes - Raw bytes of the `.model` SentencePiece protobuf file.
   */
  constructor(modelBytes: Uint8Array) {
    this.spp = new SentencePieceProcessor();
    if (isNodeEnv()) {
      this.ready = this.loadViaTempFile(modelBytes);
    } else {
      this.ready = this.loadViaFsShim(modelBytes);
    }
  }

  /**
   * Node / vitest path: materialize the bytes as a real temp file, point
   * sentencepiece-js at it, then remove the file and directory again.
   * Dynamic imports keep node: modules out of Vite's browser dependency graph.
   */
  private async loadViaTempFile(modelBytes: Uint8Array): Promise<void> {
    // Dynamic imports so Vite never statically analyses these node: specifiers.
    // The ts-ignore comments suppress TS2591 errors in the browser-targeted
    // tsconfig (which does not include @types/node in its `types` array).
    // eslint-disable-next-line @typescript-eslint/ban-ts-comment
    // @ts-ignore – node:fs not in browser tsconfig types
    const fsMod = await import('node:fs');
    // eslint-disable-next-line @typescript-eslint/ban-ts-comment
    // @ts-ignore – node:os not in browser tsconfig types
    const osMod = await import('node:os');
    // eslint-disable-next-line @typescript-eslint/ban-ts-comment
    // @ts-ignore – node:path not in browser tsconfig types
    const pathMod = await import('node:path');

    // Fresh per-instance temp dir so concurrent tokenizers never collide.
    const tempDir = (fsMod.mkdtempSync as Function)(
      pathMod.join(osMod.tmpdir(), 'needle-sp-'),
    );
    const tempModelPath = pathMod.join(tempDir, 'tokenizer.model');
    try {
      (fsMod.writeFileSync as Function)(tempModelPath, modelBytes);
      await this.spp.load(tempModelPath);
    } finally {
      // Best-effort cleanup; a leftover temp file must not fail the load.
      try { (fsMod.unlinkSync as Function)(tempModelPath); } catch { /* ignore */ }
      try { (fsMod.rmdirSync as Function)(tempDir); } catch { /* ignore */ }
    }
  }

  /**
   * Browser path: register the bytes under a virtual path in the fs-shim and
   * "load" that path. Because Vite aliases `fs` → sp-fs-shim.ts, the
   * `fs.readFileSync(virtualPath)` call inside sentencepiece-js is
   * intercepted and returns our registered bytes.
   */
  private async loadViaFsShim(modelBytes: Uint8Array): Promise<void> {
    const shim = await import('./sp-fs-shim');
    shim.registerBytes(SP_VIRTUAL_PATH, modelBytes);
    await this.spp.load(SP_VIRTUAL_PATH);
  }

  /** Wait for the model to finish loading before calling encode / decode. */
  async init(): Promise<void> {
    await this.ready;
  }

  /**
   * Encode `text` into a list of token IDs using the Needle SP BPE model.
   * Identical to `spp.encode_as_ids(text)` in Python.
   */
  encode(text: string): number[] {
    const ids = this.spp.encodeIds(text);
    return ids;
  }

  /**
   * Decode a list of token IDs back into a string.
   * Identical to `spp.decode(ids)` in Python.
   */
  decode(ids: number[]): string {
    const text = this.spp.decodeIds(ids);
    return text;
  }

  /**
   * Encode `text` into piece strings (for debugging / inspection).
   */
  encodePieces(text: string): string[] {
    const pieces = this.spp.encodePieces(text);
    return pieces;
  }
}
/**
* Convenience factory: create a `Tokenizer` and wait for it to be ready.
*
* @param modelBytes - Raw bytes of the `.model` file.
*/
export async function createTokenizer(modelBytes: Uint8Array): Promise<Tokenizer> {
const t = new Tokenizer(modelBytes);
await t.init();
return t;
}