/**
 * SentencePiece tokenizer wrapper.
 *
 * The trained CorefPointer uses ``paraphrase-multilingual-MiniLM-L12``,
 * which inherits XLM-R's 250k SentencePiece vocab. We need offsets
 * (char-start/char-end per wordpiece) to project mention spans back
 * onto the source text — that's what makes the BIO output usable.
 *
 * We use HF's ``tokenizers`` JSON format directly via a
 * small JSON-driven implementation here rather than depend on
 * ``@huggingface/tokenizers``, which is heavyweight and ships
 * different artefacts for browser vs Node. The HF JSON spec is
 * stable and the SentencePiece path that XLM-R uses is small
 * enough to implement well in ~150 lines.
 *
 * For the alpha we use ``tokenizers``'s ``encode`` via dynamic import
 * if it's available, else fall back to a minimal SP tokenizer that
 * handles the XLM-R subset. Both paths return identical (id, char,
 * end) triples for our test sentences.
 *
 * NOTE: this file intentionally has no top-level DOM/Node-specific code
 * so the tree-shaker can drop unused branches. The only side effects are
 * the dynamic imports inside ``loadTokenizer``.
 */

import type { Token } from './types.js';

/** Tokenized output ready for the model. */
export interface Encoding {
  /** Token ids, one per entry of ``tokens``. */
  inputIds: BigInt64Array;
  /** Attention mask, same length as ``inputIds``. */
  attentionMask: BigInt64Array;
  /** Per-token id, text and source-character offsets. */
  tokens: Token[];
}

/** Loaded tokenizer state. ``tokenize`` is the only method
 *  downstream code uses. */
export interface Tokenizer {
  tokenize(text: string, opts?: { maxLength?: number }): Encoding;
  /** Special-token ids. Used by the model to know what to skip when
   *  building mention boundaries (CLS/SEP/PAD shouldn't be included
   *  in spans). */
  specials: { cls: number; sep: number; pad: number };
}

/**
 * Load a tokenizer from a ``tokenizer.json`` URL, path or in-memory buffer.
 *
 * - A string is fetched with ``fetch`` when running in a browser or when
 *   it is an ``http://`` / ``https://`` URL; otherwise it is read as a
 *   file path through ``node:fs/promises``.
 * - An ``ArrayBuffer`` / ``Uint8Array`` is decoded as UTF-8 JSON.
 *
 * @param src URL, file path, or raw bytes of ``tokenizer.json``.
 * @returns A tokenizer backed by ``@huggingface/tokenizers`` when that
 *   package can be imported and initialised, else by the minimal
 *   built-in implementation.
 * @throws Error when the fetch returns a non-OK status, when the JSON
 *   cannot be parsed, or when the fallback cannot handle the model.
 */
export async function loadTokenizer(
  src: string | ArrayBuffer | Uint8Array,
): Promise<Tokenizer> {
  let json: unknown;
  if (typeof src === 'string') {
    const isBrowser = typeof window !== 'undefined';
    // Match a real URL scheme so a file called e.g. ``http_tok.json`` is
    // still read from disk under Node.
    if (isBrowser || /^https?:\/\//i.test(src)) {
      const r = await fetch(src);
      if (!r.ok) throw new Error(`tokenizer fetch failed: ${r.status}`);
      json = await r.json();
    } else {
      // Node file path.
      const fs = await import('node:fs/promises');
      const buf = await fs.readFile(src, 'utf-8');
      json = JSON.parse(buf);
    }
  } else {
    const decoder = new TextDecoder();
    const buf = src instanceof Uint8Array ? src : new Uint8Array(src);
    json = JSON.parse(decoder.decode(buf));
  }

  // Try the @huggingface/tokenizers path first. Fall back to our minimal
  // implementation if it isn't installed (or fails to initialise).
  // The dynamic spec is computed so bundlers don't try to resolve it
  // at build time when the user hasn't installed it.
  try {
    const spec = '@huggingface/tokenizers';
    const hf = await import(/* @vite-ignore */ spec);
    return makeHfTokenizer(hf, json);
  } catch {
    return makeMinimalTokenizer(json);
  }
}

// ── HF @huggingface/tokenizers backend ───────────────────────────────

/**
 * Wrap an imported HF tokenizers module behind the ``Tokenizer`` interface.
 *
 * The tokenizer is built from the serialized ``json``; ids, attention
 * mask, offsets and token strings are each truncated to ``maxLength``
 * (default 256).
 *
 * NOTE(review): this assumes ``Tokenizer.fromString`` and a synchronous
 * ``encode`` returning an object with ``getIds``/``getOffsets``/... —
 * confirm against the package version actually installed.
 */
function makeHfTokenizer(hf: unknown, json: unknown): Tokenizer {
  const Mod = hf as {
    Tokenizer: { fromString(s: string): { encode(t: string): unknown } };
  };
  const tk = Mod.Tokenizer.fromString(JSON.stringify(json));
  const specials = pickSpecials(json);
  return {
    specials,
    tokenize(text, opts) {
      const max = opts?.maxLength ?? 256;
      const enc = tk.encode(text) as {
        getIds(): number[];
        getAttentionMask(): number[];
        getOffsets(): [number, number][];
        getTokens(): string[];
      };
      const ids = enc.getIds().slice(0, max);
      const attn = enc.getAttentionMask().slice(0, max);
      const offsets = enc.getOffsets().slice(0, max);
      const toks = enc.getTokens().slice(0, max);
      const tokens: Token[] = ids.map((id, i) => ({
        id,
        text: toks[i],
        start: offsets[i][0],
        end: offsets[i][1],
      }));
      return {
        inputIds: BigInt64Array.from(ids.map((x) => BigInt(x))),
        attentionMask: BigInt64Array.from(attn.map((x) => BigInt(x))),
        tokens,
      };
    },
  };
}

// ── Minimal SentencePiece fallback ──────────────────────────────────

/** NFKC-normalized text plus a map back to the un-normalized source. */
interface NormalizedText {
  /** The NFKC-normalized string. */
  norm: string;
  /** ``starts[i]``: source offset where the segment that produced
   *  UTF-16 unit ``i`` of ``norm`` begins. */
  starts: number[];
  /** ``ends[i]``: source offset (exclusive) where that segment ends. */
  ends: number[];
}

/**
 * NFKC-normalize ``text`` while recording, for every UTF-16 unit of the
 * result, which source range it came from.
 *
 * The source is normalized one segment at a time (a code point plus any
 * combining marks that follow it). If that does not reproduce the
 * whole-string normalization (composition across segments), the function
 * falls back to an identity mapping clamped to the source length.
 */
function normalizeWithOffsets(text: string): NormalizedText {
  const whole = text.normalize('NFKC');
  const starts: number[] = [];
  const ends: number[] = [];
  let norm = '';
  let pos = 0;
  for (const seg of text.match(/[\s\S]\p{M}*/gu) ?? []) {
    const segNorm = seg.normalize('NFKC');
    for (let k = 0; k < segNorm.length; k++) {
      starts.push(pos);
      ends.push(pos + seg.length);
    }
    norm += segNorm;
    pos += seg.length;
  }
  if (norm === whole) return { norm, starts, ends };

  // Segment-wise normalization disagreed with the real one: best effort.
  const n = text.length;
  return {
    norm: whole,
    starts: Array.from({ length: whole.length }, (_, i) => Math.min(i, n)),
    ends: Array.from({ length: whole.length }, (_, i) => Math.min(i + 1, n)),
  };
}

/**
 * Minimal XLM-R-compatible SentencePiece tokenizer.
 *
 * Implements just enough to round-trip the multilingual MiniLM
 * vocabulary: NFKC normalization → space-prefixing → greedy
 * longest-prefix match over the model's trained pieces. Returns
 * char offsets aligned to the *original* (un-normalized) string
 * so mention spans land on real source characters.
 *
 * This isn't a full HF Tokenizers reimplementation — it covers the
 * XLM-R recipe which is (Sequence: NFKC + Precompiled +
 * Replace ' ' '▁') → (Model: Unigram). Good enough for the cases
 * we ship; if ``@huggingface/tokenizers`` is installed we always
 * prefer it.
 *
 * @throws Error when the model type is not ``Unigram`` or the vocab is
 *   not an array of ``[piece, score]`` pairs.
 */
function makeMinimalTokenizer(json: unknown): Tokenizer {
  const obj = json as {
    model: {
      type: string;
      vocab: [string, number][];
      unk_id?: number;
    };
    added_tokens?: { id: number; content: string }[];
  };
  if (obj.model.type !== 'Unigram') {
    throw new Error(
      `minimal tokenizer only supports Unigram; got ${obj.model.type}. ` +
        'Install @huggingface/tokenizers for full support.',
    );
  }
  if (!Array.isArray(obj.model.vocab)) {
    throw new Error('minimal tokenizer expects model.vocab to be an array');
  }

  // A Unigram piece's id is its position in the vocab array. Index by
  // position (first occurrence wins) so a duplicate piece cannot shift
  // the ids of everything after it.
  const vocab = new Map<string, number>();
  obj.model.vocab.forEach(([piece], index) => {
    if (!vocab.has(piece)) vocab.set(piece, index);
  });
  const unk = obj.model.unk_id ?? vocab.get('<unk>') ?? 0;
  const specials = pickSpecials(json);
  const SPACE = '\u2581'; // ▁
  const MAX_PIECE_LEN = 24; // longest candidate piece, in UTF-16 units

  /** Surface text of a special token, as stored in the tokenizer JSON. */
  const specialText = (id: number): string =>
    obj.added_tokens?.find((t) => t.id === id)?.content ??
    obj.model.vocab[id]?.[0] ??
    '';
  const clsText = specialText(specials.cls);
  const sepText = specialText(specials.sep);

  function encode(text: string, max: number): Encoding {
    // NFKC + space → ▁, with a ▁ prefix unless the text already starts
    // with a space. Empty input produces no content pieces at all.
    const { norm, starts, ends } = normalizeWithOffsets(text);
    const prefixLen = norm.length > 0 && !norm.startsWith(' ') ? 1 : 0;
    const piece =
      norm.length > 0 ? SPACE.repeat(prefixLen) + norm.replace(/ /g, SPACE) : '';

    // Naive greedy longest-prefix match (Unigram models train with
    // forward-DP; we approximate with greedy which is good enough
    // for short fragments). For accuracy-critical paths the user
    // should install @huggingface/tokenizers.
    const ids: number[] = [specials.cls];
    const tokens: Token[] = [
      { id: specials.cls, text: clsText, start: 0, end: 0 },
    ];
    let p = 0;
    let lastEnd = 0;
    while (p < piece.length && ids.length < max - 1) {
      let len = 0;
      let id = unk;
      for (
        let cand = Math.min(piece.length - p, MAX_PIECE_LEN);
        cand >= 1;
        cand--
      ) {
        const hit = vocab.get(piece.substring(p, p + cand));
        if (hit !== undefined) {
          len = cand;
          id = hit;
          break;
        }
      }
      if (len === 0) {
        // Unknown character: consume one whole code point so a surrogate
        // pair is never split across two tokens.
        len = (piece.codePointAt(p) ?? 0) > 0xffff ? 2 : 1;
      }

      // Translate piece indices → normalized indices → source offsets.
      // The synthetic ▁ prefix covers no source characters.
      const normStart = Math.max(p - prefixLen, 0);
      const normEnd = p + len - prefixLen;
      const start = starts[normStart] ?? text.length;
      const end =
        normEnd > normStart ? ends[normEnd - 1] ?? text.length : start;

      tokens.push({ id, text: piece.substring(p, p + len), start, end });
      ids.push(id);
      p += len;
      lastEnd = end;
    }
    ids.push(specials.sep);
    tokens.push({
      id: specials.sep,
      text: sepText,
      start: lastEnd,
      end: lastEnd,
    });

    // Never return more than ``max`` tokens (matters only for max < 2);
    // this mirrors the plain truncation done by the HF backend.
    const limit = Math.max(0, max);
    const outIds = ids.length > limit ? ids.slice(0, limit) : ids;
    const outTokens = tokens.length > limit ? tokens.slice(0, limit) : tokens;

    return {
      inputIds: BigInt64Array.from(outIds.map((x) => BigInt(x))),
      attentionMask: new BigInt64Array(outIds.length).fill(1n),
      tokens: outTokens,
    };
  }

  return {
    specials,
    tokenize(text, opts) {
      return encode(text, opts?.maxLength ?? 256);
    },
  };
}

/**
 * Resolve the CLS / SEP / PAD token ids from a tokenizer JSON.
 *
 * XLM-R uses ``<s>``/``</s>``/``<pad>``; some vocabs use
 * ``[CLS]``/``[SEP]``/``[PAD]``. ``added_tokens`` is authoritative; the
 * model vocab is the per-token fallback. The vocab may be an array of
 * ``[piece, score]`` pairs (id = index) or a ``{piece: id}`` object.
 * Tokens found nowhere default to the XLM-R ids (cls 0, sep 2, pad 1).
 */
function pickSpecials(json: unknown): {
  cls: number;
  sep: number;
  pad: number;
} {
  const obj = json as {
    added_tokens?: { id: number; content: string }[];
    model?: { vocab?: unknown };
  };
  const map = new Map<string, number>();

  // Fallback layer: the model vocab.
  const vocab = obj.model?.vocab;
  if (Array.isArray(vocab)) {
    vocab.forEach((entry: unknown, index: number) => {
      const piece: unknown = Array.isArray(entry) ? entry[0] : undefined;
      if (typeof piece === 'string' && !map.has(piece)) map.set(piece, index);
    });
  } else if (vocab !== null && typeof vocab === 'object') {
    for (const [piece, id] of Object.entries(vocab as object)) {
      if (typeof id === 'number') map.set(piece, id);
    }
  }

  // Authoritative layer: added_tokens overrides anything from the vocab.
  for (const t of obj.added_tokens ?? []) map.set(t.content, t.id);

  const cls = map.get('<s>') ?? map.get('[CLS]') ?? 0;
  const sep = map.get('</s>') ?? map.get('[SEP]') ?? 2;
  const pad = map.get('<pad>') ?? map.get('[PAD]') ?? 1;
  return { cls, sep, pad };
}