/**
* Needle SentencePiece tokenizer wrapper.
*
* Wraps the sentencepiece-js WASM library to expose a clean `Tokenizer` class
* whose `encode` / `decode` methods are byte-for-byte identical to Cactus's
* Python SentencePiece processor.
*
* Construction:
* - In Node / vitest: pass `fs.readFileSync(modelPath)` as `modelBytes`.
* - In browser: `fetch(modelUrl).then(r => r.arrayBuffer()).then(b => new Uint8Array(b))`.
*
* The model is a BPE SentencePiece protobuf with:
* - vocab_size = 8192
* - byte_fallback = true (unknown bytes → <0xNN> pieces)
* - normalization_rule_name = "identity" (no NFC/NFKC)
* - user_defined_symbols = ["<tool_call>", "<tools>"] at IDs 4 and 5
*/
import { SentencePieceProcessor } from 'sentencepiece-js';
// Unique virtual path used to hand bytes through the fs-shim to sentencepiece-js.
const SP_VIRTUAL_PATH = '__needle_tokenizer__.model';
/**
* Determine at runtime whether we are running in Node.js (including vitest).
* We use (globalThis as any) to avoid TypeScript needing node type definitions
* in the browser-targeted tsconfig.
*/
function isNodeEnv(): boolean {
  // Reach for globalThis.process without requiring @types/node in the
  // browser-targeted tsconfig.
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  const proc = (globalThis as any).process;
  if (typeof proc !== 'object' || proc === null) {
    return false;
  }
  const versions = proc.versions;
  return typeof versions === 'object' && typeof versions.node === 'string';
}
export class Tokenizer {
  /** Underlying sentencepiece-js processor instance. */
  private spp: SentencePieceProcessor;
  /** Resolves once the `.model` protobuf has been loaded into `spp`. */
  private ready: Promise<void>;

  /**
   * Start loading the tokenizer model. Await `init()` (or use the
   * `createTokenizer` factory) before calling `encode` / `decode`.
   *
   * @param modelBytes - Raw bytes of the `.model` SentencePiece protobuf file.
   */
  constructor(modelBytes: Uint8Array) {
    this.spp = new SentencePieceProcessor();
    this.ready = isNodeEnv()
      ? this.loadViaTempFile(modelBytes)
      : this.loadViaFsShim(modelBytes);
  }

  /**
   * Node / vitest path: persist the bytes to a real temp file, load it, then
   * remove the file and its directory. Dynamic imports keep `node:` modules
   * out of Vite's browser dependency graph.
   */
  private async loadViaTempFile(modelBytes: Uint8Array): Promise<void> {
    // Dynamic imports so Vite never statically analyses these node: specifiers.
    // The ts-ignore comments suppress TS2591 errors in the browser-targeted
    // tsconfig (which does not include @types/node in its `types` array).
    // eslint-disable-next-line @typescript-eslint/ban-ts-comment
    // @ts-ignore – node:fs not in browser tsconfig types
    const fs = await import('node:fs');
    // eslint-disable-next-line @typescript-eslint/ban-ts-comment
    // @ts-ignore – node:os not in browser tsconfig types
    const os = await import('node:os');
    // eslint-disable-next-line @typescript-eslint/ban-ts-comment
    // @ts-ignore – node:path not in browser tsconfig types
    const path = await import('node:path');

    const tempDir = (fs.mkdtempSync as Function)(path.join(os.tmpdir(), 'needle-sp-'));
    const tempModelPath = path.join(tempDir, 'tokenizer.model');
    try {
      (fs.writeFileSync as Function)(tempModelPath, modelBytes);
      await this.spp.load(tempModelPath);
    } finally {
      // Best-effort cleanup: the load result matters, the temp file does not.
      try { (fs.unlinkSync as Function)(tempModelPath); } catch { /* ignore */ }
      try { (fs.rmdirSync as Function)(tempDir); } catch { /* ignore */ }
    }
  }

  /**
   * Browser path: register the bytes under a virtual path in the fs-shim and
   * ask sentencepiece-js to "load" that path. Vite aliases `fs` →
   * sp-fs-shim.ts, so the library's internal `fs.readFileSync(virtualPath)`
   * is intercepted and handed back our registered bytes.
   */
  private async loadViaFsShim(modelBytes: Uint8Array): Promise<void> {
    const shim = await import('./sp-fs-shim');
    shim.registerBytes(SP_VIRTUAL_PATH, modelBytes);
    await this.spp.load(SP_VIRTUAL_PATH);
  }

  /** Wait for the model to finish loading before calling encode / decode. */
  async init(): Promise<void> {
    await this.ready;
  }

  /**
   * Encode `text` into a list of token IDs using the Needle SP BPE model.
   * Identical to `spp.encode_as_ids(text)` in Python.
   */
  encode(text: string): number[] {
    return this.spp.encodeIds(text);
  }

  /**
   * Decode a list of token IDs back into a string.
   * Identical to `spp.decode(ids)` in Python.
   */
  decode(ids: number[]): string {
    return this.spp.decodeIds(ids);
  }

  /**
   * Encode `text` into piece strings (for debugging / inspection).
   */
  encodePieces(text: string): string[] {
    return this.spp.encodePieces(text);
  }
}
/**
* Convenience factory: create a `Tokenizer` and wait for it to be ready.
*
* @param modelBytes - Raw bytes of the `.model` file.
*/
export async function createTokenizer(modelBytes: Uint8Array): Promise<Tokenizer> {
const t = new Tokenizer(modelBytes);
await t.init();
return t;
}