Spaces:
Running
Running
/**
 * Needle SentencePiece tokenizer wrapper.
 *
 * Wraps the sentencepiece-js WASM library to expose a clean `Tokenizer` class
 * whose `encode` / `decode` methods are byte-for-byte identical to Cactus's
 * Python SentencePiece processor.
 *
 * Construction:
 *   - In Node / vitest: pass `fs.readFileSync(modelPath)` as `modelBytes`.
 *   - In browser: `fetch(modelUrl).then(r => r.arrayBuffer()).then(b => new Uint8Array(b))`.
 *
 * The model is a BPE SentencePiece protobuf with:
 *   - vocab_size = 8192
 *   - byte_fallback = true (unknown bytes → <0xNN> pieces)
 *   - normalization_rule_name = "identity" (no NFC/NFKC)
 *   - user_defined_symbols = ["<tool_call>", "<tools>"] at IDs 4 and 5
 */
| import { SentencePieceProcessor } from 'sentencepiece-js'; | |
// Virtual path used to hand model bytes through the fs-shim to sentencepiece-js.
// In the browser build, Vite aliases `fs` to sp-fs-shim.ts, so when
// sentencepiece-js calls `fs.readFileSync(SP_VIRTUAL_PATH)` the shim returns
// the bytes registered under this key instead of touching a real filesystem.
const SP_VIRTUAL_PATH = '__needle_tokenizer__.model';
| /** | |
| * Determine at runtime whether we are running in Node.js (including vitest). | |
| * We use (globalThis as any) to avoid TypeScript needing node type definitions | |
| * in the browser-targeted tsconfig. | |
| */ | |
| function isNodeEnv(): boolean { | |
| // eslint-disable-next-line @typescript-eslint/no-explicit-any | |
| const g = globalThis as any; | |
| return ( | |
| typeof g.process === 'object' && | |
| g.process !== null && | |
| typeof g.process.versions === 'object' && | |
| typeof g.process.versions.node === 'string' | |
| ); | |
| } | |
| export class Tokenizer { | |
| private spp: SentencePieceProcessor; | |
| private ready: Promise<void>; | |
| /** | |
| * @param modelBytes - Raw bytes of the `.model` SentencePiece protobuf file. | |
| */ | |
| constructor(modelBytes: Uint8Array) { | |
| this.spp = new SentencePieceProcessor(); | |
| this.ready = isNodeEnv() | |
| ? this._loadNode(modelBytes) | |
| : this._loadBrowser(modelBytes); | |
| } | |
| /** | |
| * Node / vitest path: write bytes to a real temp file, load it, then clean up. | |
| * Dynamic imports keep node: modules out of Vite's browser dependency graph. | |
| */ | |
| private async _loadNode(modelBytes: Uint8Array): Promise<void> { | |
| // Dynamic imports so Vite never statically analyses these node: specifiers. | |
| // The ts-ignore comments suppress TS2591 errors in the browser-targeted tsconfig | |
| // (which does not include @types/node in its `types` array). | |
| // eslint-disable-next-line @typescript-eslint/ban-ts-comment | |
| // @ts-ignore β node:fs not in browser tsconfig types | |
| const { writeFileSync, unlinkSync, rmdirSync, mkdtempSync } = await import('node:fs'); | |
| // eslint-disable-next-line @typescript-eslint/ban-ts-comment | |
| // @ts-ignore β node:os not in browser tsconfig types | |
| const { tmpdir } = await import('node:os'); | |
| // eslint-disable-next-line @typescript-eslint/ban-ts-comment | |
| // @ts-ignore β node:path not in browser tsconfig types | |
| const { join } = await import('node:path'); | |
| const dir = (mkdtempSync as Function)(join(tmpdir(), 'needle-sp-')); | |
| const modelPath = join(dir, 'tokenizer.model'); | |
| try { | |
| (writeFileSync as Function)(modelPath, modelBytes); | |
| await this.spp.load(modelPath); | |
| } finally { | |
| try { (unlinkSync as Function)(modelPath); } catch { /* ignore */ } | |
| try { (rmdirSync as Function)(dir); } catch { /* ignore */ } | |
| } | |
| } | |
| /** | |
| * Browser path: register bytes under a virtual path in the fs-shim, then | |
| * tell sentencepiece-js to "load" that path. Because Vite aliases `fs` β | |
| * sp-fs-shim.ts, the `fs.readFileSync(virtualPath)` call inside | |
| * sentencepiece-js is intercepted and returns our registered bytes. | |
| */ | |
| private async _loadBrowser(modelBytes: Uint8Array): Promise<void> { | |
| const { registerBytes } = await import('./sp-fs-shim'); | |
| registerBytes(SP_VIRTUAL_PATH, modelBytes); | |
| await this.spp.load(SP_VIRTUAL_PATH); | |
| } | |
| /** Wait for the model to finish loading before calling encode / decode. */ | |
| async init(): Promise<void> { | |
| await this.ready; | |
| } | |
| /** | |
| * Encode `text` into a list of token IDs using the Needle SP BPE model. | |
| * Identical to `spp.encode_as_ids(text)` in Python. | |
| */ | |
| encode(text: string): number[] { | |
| return this.spp.encodeIds(text); | |
| } | |
| /** | |
| * Decode a list of token IDs back into a string. | |
| * Identical to `spp.decode(ids)` in Python. | |
| */ | |
| decode(ids: number[]): string { | |
| return this.spp.decodeIds(ids); | |
| } | |
| /** | |
| * Encode `text` into piece strings (for debugging / inspection). | |
| */ | |
| encodePieces(text: string): string[] { | |
| return this.spp.encodePieces(text); | |
| } | |
| } | |
| /** | |
| * Convenience factory: create a `Tokenizer` and wait for it to be ready. | |
| * | |
| * @param modelBytes - Raw bytes of the `.model` file. | |
| */ | |
| export async function createTokenizer(modelBytes: Uint8Array): Promise<Tokenizer> { | |
| const t = new Tokenizer(modelBytes); | |
| await t.init(); | |
| return t; | |
| } | |