/**
 * Needle SentencePiece tokenizer wrapper.
 *
 * Wraps the sentencepiece-js WASM library to expose a clean `Tokenizer` class
 * whose `encode` / `decode` methods are byte-for-byte identical to Cactus's
 * Python SentencePiece processor.
 *
 * Construction:
 *   - In Node / vitest: pass `fs.readFileSync(modelPath)` as `modelBytes`.
 *   - In browser: `fetch(modelUrl).then(r => r.arrayBuffer()).then(b => new Uint8Array(b))`.
 *
 * The model is a BPE SentencePiece protobuf with:
 *   - vocab_size = 8192
 *   - byte_fallback = true  (unknown bytes → <0xNN> pieces)
 *   - normalization_rule_name = "identity"  (no NFC/NFKC)
 *   - user_defined_symbols = ["", ""] at IDs 4 and 5
 */

import { SentencePieceProcessor } from 'sentencepiece-js';

// Virtual path used to hand bytes through the fs-shim to sentencepiece-js.
// NOTE(review): every Tokenizer instance registers its bytes under this same
// path; if two instances with different models load concurrently in the
// browser they may race — confirm against sp-fs-shim before relying on that.
const SP_VIRTUAL_PATH = '__needle_tokenizer__.model';

/**
 * Determine at runtime whether we are running in Node.js (including vitest).
 * We use (globalThis as any) to avoid TypeScript needing node type definitions
 * in the browser-targeted tsconfig.
 *
 * @returns `true` when `globalThis.process.versions.node` is a string.
 */
function isNodeEnv(): boolean {
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  const g = globalThis as any;
  return (
    typeof g.process === 'object' &&
    g.process !== null &&
    typeof g.process.versions === 'object' &&
    typeof g.process.versions.node === 'string'
  );
}

export class Tokenizer {
  private readonly spp: SentencePieceProcessor;
  /** Settles when the model has been loaded (or rejects if loading failed). */
  private readonly ready: Promise<void>;
  /** Flipped to `true` only after the model load has completed successfully. */
  private loaded = false;

  /**
   * Starts loading the model immediately; await `init()` before encoding.
   *
   * @param modelBytes - Raw bytes of the `.model` SentencePiece protobuf file.
   */
  constructor(modelBytes: Uint8Array) {
    this.spp = new SentencePieceProcessor();
    this.ready = this._load(modelBytes);
  }

  /**
   * Pick the environment-specific loader and record successful completion.
   */
  private async _load(modelBytes: Uint8Array): Promise<void> {
    if (isNodeEnv()) {
      await this._loadNode(modelBytes);
    } else {
      await this._loadBrowser(modelBytes);
    }
    this.loaded = true;
  }

  /**
   * Node / vitest path: write bytes to a real temp file, load it, then clean up.
   * Dynamic imports keep node: modules out of Vite's browser dependency graph.
   */
  private async _loadNode(modelBytes: Uint8Array): Promise<void> {
    // Dynamic imports so Vite never statically analyses these node: specifiers.
    // The ts-ignore comments suppress TS2591 errors in the browser-targeted tsconfig
    // (which does not include @types/node in its `types` array). They stay as
    // @ts-ignore rather than @ts-expect-error because a tsconfig that does
    // include node types would report no error on these lines.
    // eslint-disable-next-line @typescript-eslint/ban-ts-comment
    // @ts-ignore – node:fs not in browser tsconfig types
    const { writeFileSync, unlinkSync, rmdirSync, mkdtempSync } = await import('node:fs');
    // eslint-disable-next-line @typescript-eslint/ban-ts-comment
    // @ts-ignore – node:os not in browser tsconfig types
    const { tmpdir } = await import('node:os');
    // eslint-disable-next-line @typescript-eslint/ban-ts-comment
    // @ts-ignore – node:path not in browser tsconfig types
    const { join } = await import('node:path');

    // A private temp directory avoids clashing with other files in tmpdir().
    const dir: string = mkdtempSync(join(tmpdir(), 'needle-sp-'));
    const modelPath: string = join(dir, 'tokenizer.model');
    try {
      writeFileSync(modelPath, modelBytes);
      await this.spp.load(modelPath);
    } finally {
      // Best-effort cleanup: a failure to remove the temp file or directory
      // must not mask a load error or fail an otherwise successful load.
      try {
        unlinkSync(modelPath);
      } catch {
        /* ignore */
      }
      try {
        rmdirSync(dir);
      } catch {
        /* ignore */
      }
    }
  }

  /**
   * Browser path: register bytes under a virtual path in the fs-shim, then
   * tell sentencepiece-js to "load" that path. Because Vite aliases `fs` →
   * sp-fs-shim.ts, the `fs.readFileSync(virtualPath)` call inside
   * sentencepiece-js is intercepted and returns our registered bytes.
   */
  private async _loadBrowser(modelBytes: Uint8Array): Promise<void> {
    const { registerBytes } = await import('./sp-fs-shim');
    registerBytes(SP_VIRTUAL_PATH, modelBytes);
    await this.spp.load(SP_VIRTUAL_PATH);
  }

  /**
   * Fail fast with a descriptive message when the model is not loaded yet.
   *
   * @throws Error if the model load has not completed successfully.
   */
  private _assertReady(): void {
    if (!this.loaded) {
      throw new Error(
        'Tokenizer is not ready: await init() (or use createTokenizer) before encoding or decoding.',
      );
    }
  }

  /**
   * Wait for the model to finish loading before calling encode / decode.
   * Rejects with the underlying error if loading failed.
   */
  async init(): Promise<void> {
    await this.ready;
  }

  /**
   * Encode `text` into a list of token IDs using the Needle SP BPE model.
   * Identical to `spp.encode_as_ids(text)` in Python.
   *
   * @throws Error if called before the model has finished loading.
   */
  encode(text: string): number[] {
    this._assertReady();
    return this.spp.encodeIds(text);
  }

  /**
   * Decode a list of token IDs back into a string.
   * Identical to `spp.decode(ids)` in Python.
   *
   * @throws Error if called before the model has finished loading.
   */
  decode(ids: number[]): string {
    this._assertReady();
    return this.spp.decodeIds(ids);
  }

  /**
   * Encode `text` into piece strings (for debugging / inspection).
   *
   * @throws Error if called before the model has finished loading.
   */
  encodePieces(text: string): string[] {
    this._assertReady();
    return this.spp.encodePieces(text);
  }
}

/**
 * Convenience factory: create a `Tokenizer` and wait for it to be ready.
 *
 * @param modelBytes - Raw bytes of the `.model` file.
 * @returns A tokenizer whose model has finished loading.
 */
export async function createTokenizer(modelBytes: Uint8Array): Promise<Tokenizer> {
  const t = new Tokenizer(modelBytes);
  await t.init();
  return t;
}