infon-coref-pointer / js /src /tokenizer.ts

Upload js/src/tokenizer.ts with huggingface_hub

bd7a5c9 verified 24 days ago

8.63 kB

	/**
	* SentencePiece tokenizer wrapper.
	*
	* The trained CorefPointer uses ``paraphrase-multilingual-MiniLM-L12``,
	* which inherits XLM-R's 250k SentencePiece vocab. We need offsets
	* (char-start/char-end per wordpiece) to project mention spans back
	* onto the source text — that's what makes the BIO output usable.
	*
	* We read HF's ``tokenizers`` JSON format directly via a
	* small JSON-driven implementation here rather than take a hard
	* dependency on ``@huggingface/tokenizers``, which is heavyweight and
	* ships different artefacts for browser vs Node. The HF JSON spec is
	* stable and the SentencePiece-Unigram path that XLM-R uses is small
	* enough to implement well in ~150 lines.
	*
	* For the alpha we use ``tokenizers``'s ``encode`` via dynamic import
	* if it's available, else fall back to a minimal SP tokenizer that
	* handles the XLM-R subset. Both paths return identical (id, char,
	* end) triples for our test sentences.
	*
	* NOTE: the only environment-specific code in this file (``window``,
	* ``fetch``, ``node:fs/promises``) lives inside ``loadTokenizer``, so
	* the tree-shaker can drop unused branches. The only side effects are
	* the dynamic imports inside ``loadTokenizer``.
	*/

	import type { Token } from './types.js';

	/**
	* Tokenized output ready for the model.
	*
	* The backends in this file build all three fields from the same
	* token sequence, so index ``i`` refers to the same token in each.
	*/
	export interface Encoding {
	/** Token ids, widened to 64-bit integers via ``BigInt(id)``. */
	inputIds: BigInt64Array;
	/** Attention mask as 64-bit integers. The minimal backend emits all
	* ones; the HF backend forwards whatever ``getAttentionMask`` returns. */
	attentionMask: BigInt64Array;
	/** Per-token records. The backends in this file populate ``id``,
	* ``text``, ``start`` and ``end`` (character offsets) on each one. */
	tokens: Token[];
	}

	/** Loaded tokenizer state. ``tokenize`` is the only method
	* downstream code uses. */
	export interface Tokenizer {
	/** Encode ``text`` into ids, attention mask and per-token offsets.
	* ``opts.maxLength`` caps the number of tokens; both implementations
	* in this file default it to 256 when omitted. */
	tokenize(text: string, opts?: { maxLength?: number }): Encoding;
	/** Special-token ids. Used by the model to know what to skip when
	* building mention boundaries (CLS/SEP/PAD shouldn't be included
	* in spans). */
	specials: { cls: number; sep: number; pad: number };
	}

	/**
	 * Load a tokenizer from a ``tokenizer.json`` source.
	 *
	 * @param src Where to read the tokenizer definition from:
	 *   - a string: fetched with ``fetch`` when running in a browser
	 *     (``window`` is defined) or when it starts with ``http://`` /
	 *     ``https://``; otherwise treated as a file path and read through
	 *     ``node:fs/promises``;
	 *   - an ``ArrayBuffer`` or ``Uint8Array``: decoded as UTF-8 JSON.
	 * @returns A tokenizer backed by ``@huggingface/tokenizers`` when that
	 *   module can be imported and initialised, otherwise the minimal
	 *   built-in Unigram implementation.
	 * @throws Error when the HTTP fetch returns a non-OK status, when the
	 *   JSON cannot be parsed, or when the fallback tokenizer rejects the
	 *   model.
	 */
	export async function loadTokenizer(
	  src: string | ArrayBuffer | Uint8Array,
	): Promise<Tokenizer> {
	  let json: unknown;
	  if (typeof src === 'string') {
	    const isBrowser = typeof window !== 'undefined';
	    // Require an actual http(s) scheme: a bare ``startsWith('http')``
	    // would also send local paths such as ``http_cache/tokenizer.json``
	    // to ``fetch``.
	    const isHttpUrl = /^https?:\/\//i.test(src);
	    if (isBrowser || isHttpUrl) {
	      const r = await fetch(src);
	      if (!r.ok) throw new Error(`tokenizer fetch failed: ${r.status}`);
	      json = await r.json();
	    } else {
	      // Node file path. Imported lazily so browser bundles never
	      // touch ``node:fs``.
	      const fs = await import('node:fs/promises');
	      const buf = await fs.readFile(src, 'utf-8');
	      json = JSON.parse(buf);
	    }
	  } else {
	    const decoder = new TextDecoder();
	    const bytes = src instanceof Uint8Array ? src : new Uint8Array(src);
	    json = JSON.parse(decoder.decode(bytes));
	  }

	  // Try the @huggingface/tokenizers path first (fast, native WASM).
	  // Fall back to our minimal implementation if it isn't installed.
	  // The specifier is held in a variable so bundlers don't try to
	  // resolve it at build time when the user hasn't installed it.
	  // The fallback is deliberately best-effort: a failed import *or* a
	  // failed initialisation of the HF backend both land in the minimal
	  // tokenizer rather than aborting the load.
	  try {
	    const spec = '@huggingface/tokenizers';
	    const hf = await import(/* @vite-ignore */ spec);
	    return makeHfTokenizer(hf, json);
	  } catch {
	    return makeMinimalTokenizer(json);
	  }
	}

	// ── HF @huggingface/tokenizers backend ───────────────────────────────

	/**
	 * Build a ``Tokenizer`` on top of an already-imported HF tokenizers
	 * module.
	 *
	 * @param hf The dynamically imported module. It is cast, unchecked, to
	 *   a shape exposing ``Tokenizer.fromString`` whose ``encode`` result
	 *   offers ``getIds`` / ``getAttentionMask`` / ``getOffsets`` /
	 *   ``getTokens``; a module with a different API will throw here or on
	 *   first use.
	 * @param json The parsed ``tokenizer.json``; re-serialised for
	 *   ``fromString`` and scanned for special-token ids.
	 * @returns A tokenizer whose ``tokenize`` caps output at
	 *   ``opts.maxLength`` tokens (default 256).
	 */
	function makeHfTokenizer(hf: unknown, json: unknown): Tokenizer {
	  const Mod = hf as {
	    Tokenizer: { fromString(s: string): { encode(t: string): unknown } };
	  };
	  const tk = Mod.Tokenizer.fromString(JSON.stringify(json));

	  const specials = pickSpecials(json);

	  // Cut ``items`` down to ``max`` entries. With ``keepLast`` the final
	  // entry survives the cut, so a trailing separator is not lost.
	  const clip = <T>(items: readonly T[], max: number, keepLast: boolean): T[] => {
	    if (items.length <= max) return items.slice();
	    if (!keepLast || max < 2) return items.slice(0, max);
	    return [...items.slice(0, max - 1), items[items.length - 1] as T];
	  };

	  return {
	    specials,
	    tokenize(text, opts) {
	      const max = opts?.maxLength ?? 256;
	      const enc = tk.encode(text) as {
	        getIds(): number[];
	        getAttentionMask(): number[];
	        getOffsets(): [number, number][];
	        getTokens(): string[];
	      };
	      const allIds = enc.getIds();
	      // A plain ``slice(0, max)`` on an over-long sequence drops the
	      // closing separator. When the sequence ends with SEP, keep it and
	      // drop the content tokens just before it instead.
	      const keepSep =
	        allIds.length > max && allIds[allIds.length - 1] === specials.sep;

	      const ids = clip(allIds, max, keepSep);
	      const attn = clip(enc.getAttentionMask(), max, keepSep);
	      const offsets = clip(enc.getOffsets(), max, keepSep);
	      const toks = clip(enc.getTokens(), max, keepSep);

	      const tokens: Token[] = ids.map((id, i) => ({
	        id,
	        text: toks[i] ?? '',
	        // Guard against a backend returning fewer offsets than ids.
	        start: offsets[i]?.[0] ?? 0,
	        end: offsets[i]?.[1] ?? 0,
	      }));
	      return {
	        inputIds: BigInt64Array.from(ids.map((x) => BigInt(x))),
	        attentionMask: BigInt64Array.from(attn.map((x) => BigInt(x))),
	        tokens,
	      };
	    },
	  };
	}

	// ── Minimal SentencePiece fallback ──────────────────────────────────

	/**
	 * Minimal XLM-R-compatible SentencePiece tokenizer (fallback backend).
	 *
	 * Implements just enough to handle the multilingual MiniLM
	 * vocabulary: NFKC normalization → a ``▁`` prefix plus ``' '`` → ``▁``
	 * replacement → greedy longest-prefix matching against the model's
	 * Unigram pieces. Token offsets refer to the original
	 * (un-normalized) string so mention spans land on real source
	 * characters; a piece that starts with ``▁`` covers the space it
	 * stands for.
	 *
	 * This isn't a full HF Tokenizers reimplementation: piece scores are
	 * ignored (no Viterbi search), whitespace other than U+0020 is not
	 * converted, and unknown characters become one ``unk`` token per code
	 * point. Good enough for the cases we ship; if
	 * ``@huggingface/tokenizers`` is installed we always prefer it.
	 *
	 * @param json Parsed ``tokenizer.json``. ``model.type`` must be
	 *   ``'Unigram'`` and ``model.vocab`` an array of ``[piece, score]``
	 *   pairs whose array index is the token id.
	 * @returns A tokenizer whose ``tokenize`` wraps the pieces in CLS/SEP
	 *   and caps the total at ``opts.maxLength`` tokens (default 256).
	 * @throws Error when the model is not Unigram or has no vocab array.
	 */
	function makeMinimalTokenizer(json: unknown): Tokenizer {
	  const model = (
	    json as
	      | {
	          model?: {
	            type?: string;
	            vocab?: [string, number][];
	            unk_id?: number | null;
	          };
	        }
	      | null
	      | undefined
	  )?.model;
	  if (!model || model.type !== 'Unigram') {
	    throw new Error(
	      `minimal tokenizer only supports Unigram; got ${model?.type}. ` +
	        'Install @huggingface/tokenizers for full support.',
	    );
	  }
	  if (!Array.isArray(model.vocab)) {
	    throw new Error(
	      'minimal tokenizer expects model.vocab to be an array of [piece, score] pairs.',
	    );
	  }

	  // The token id is the entry's position in the array. Using the map's
	  // size instead would drift as soon as one piece appeared twice.
	  const vocab = new Map<string, number>();
	  let maxPieceLen = 1;
	  model.vocab.forEach(([piece], index) => {
	    vocab.set(piece, index);
	    if (piece.length > maxPieceLen) maxPieceLen = piece.length;
	  });
	  const unk = model.unk_id ?? vocab.get('<unk>') ?? 0;
	  const specials = pickSpecials(json);

	  const SPACE = '▁'; // U+2581, SentencePiece's word-start marker

	  function encode(text: string, max: number): Encoding {
	    const { norm, starts, ends } = nfkcWithOffsets(text);
	    // ``piece[0]`` is a synthetic marker with no source character;
	    // ``piece[j]`` for j >= 1 corresponds to ``norm[j - 1]``.
	    const piece = SPACE + norm.replace(/ /g, SPACE);

	    const ids: number[] = [specials.cls];
	    const tokens: Token[] = [
	      { id: specials.cls, text: '<s>', start: 0, end: 0 },
	    ];

	    // Empty input yields just <s></s>, with no lone marker token.
	    const limit = norm.length === 0 ? 0 : piece.length;
	    let p = 0; // start at the marker so the first word can match "▁word"
	    let lastEnd = 0;
	    // ``max - 1`` leaves room for the closing separator.
	    while (p < limit && ids.length < max - 1) {
	      // Greedy longest-prefix match (an approximation of Unigram's
	      // score-based search).
	      let len = Math.min(limit - p, maxPieceLen);
	      let id: number | undefined;
	      for (; len >= 1; len--) {
	        id = vocab.get(piece.substring(p, p + len));
	        if (id !== undefined) break;
	      }
	      if (id === undefined) {
	        // No piece matches: emit unk for one whole code point so a
	        // surrogate pair is never split into two tokens.
	        id = unk;
	        len = (piece.codePointAt(p) ?? 0) > 0xffff ? 2 : 1;
	      }

	      const tokenText = piece.substring(p, p + len);
	      // Map the covered normalized units back to source offsets. A
	      // token made only of the synthetic marker has zero width.
	      const firstUnit = Math.max(p - 1, 0);
	      const lastUnit = p + len - 2;
	      const start = lastUnit < 0 ? 0 : (starts[firstUnit] ?? lastEnd);
	      const end = lastUnit < 0 ? 0 : (ends[lastUnit] ?? start);

	      tokens.push({ id, text: tokenText, start, end });
	      ids.push(id);
	      p += len;
	      lastEnd = end;
	    }

	    ids.push(specials.sep);
	    tokens.push({
	      id: specials.sep,
	      text: '</s>',
	      start: lastEnd,
	      end: lastEnd,
	    });
	    return {
	      inputIds: BigInt64Array.from(ids.map((x) => BigInt(x))),
	      attentionMask: BigInt64Array.from(ids.map(() => 1n)),
	      tokens,
	    };
	  }

	  return {
	    specials,
	    tokenize(text, opts) {
	      return encode(text, opts?.maxLength ?? 256);
	    },
	  };
	}

	/**
	 * NFKC-normalize ``text`` while recording, for every UTF-16 unit of
	 * the result, the ``[start, end)`` range of source text it came from.
	 *
	 * The text is normalized segment by segment. A new segment starts at a
	 * code point only when normalizing it together with the current
	 * segment gives the same result as normalizing the two separately, so
	 * combining sequences and other interacting neighbours stay together.
	 * All units produced by one segment share that segment's source range.
	 */
	function nfkcWithOffsets(text: string): {
	  norm: string;
	  starts: number[];
	  ends: number[];
	} {
	  const starts: number[] = [];
	  const ends: number[] = [];

	  // Fast path: already normalized, so offsets are the identity.
	  if (text.normalize('NFKC') === text) {
	    for (let i = 0; i < text.length; i++) {
	      starts.push(i);
	      ends.push(i + 1);
	    }
	    return { norm: text, starts, ends };
	  }

	  let norm = '';
	  let seg = ''; // source text of the segment being built
	  let segNorm = ''; // its NFKC form
	  let segStart = 0;
	  let pos = 0;
	  const flush = (segEnd: number): void => {
	    for (let k = 0; k < segNorm.length; k++) {
	      starts.push(segStart);
	      ends.push(segEnd);
	    }
	    norm += segNorm;
	  };

	  for (const cp of text) {
	    const cpNorm = cp.normalize('NFKC');
	    const joined = (seg + cp).normalize('NFKC');
	    if (seg !== '' && joined === segNorm + cpNorm) {
	      // ``cp`` does not interact with the segment: close it here.
	      flush(pos);
	      seg = cp;
	      segNorm = cpNorm;
	      segStart = pos;
	    } else {
	      seg += cp;
	      segNorm = joined;
	    }
	    pos += cp.length;
	  }
	  flush(pos);
	  return { norm, starts, ends };
	}

	/**
	 * Resolve the CLS / SEP / PAD token ids from a parsed
	 * ``tokenizer.json``.
	 *
	 * XLM-R uses ``<s>`` / ``</s>`` / ``<pad>``; some vocabs use
	 * ``[CLS]`` / ``[SEP]`` / ``[PAD]``. For each special token the lookup
	 * order is: ``added_tokens`` (authoritative), then ``model.vocab``,
	 * then a hard-coded default (cls 0, sep 2, pad 1). The vocab is
	 * consulted per token, so an ``added_tokens`` list that names only
	 * some specials no longer hides the rest.
	 *
	 * ``model.vocab`` may be an array of ``[piece, score]`` pairs (id is
	 * the array index) or an object mapping piece to id.
	 *
	 * @param json Parsed ``tokenizer.json``; missing fields are tolerated.
	 * @returns The three special-token ids.
	 */
	function pickSpecials(json: unknown): {
	  cls: number;
	  sep: number;
	  pad: number;
	} {
	  const obj = (json ?? {}) as {
	    added_tokens?: { id: number; content: string }[];
	    model?: { vocab?: [string, number][] | Record<string, number> };
	  };

	  const added = new Map<string, number>();
	  for (const t of obj.added_tokens ?? []) added.set(t.content, t.id);

	  // The vocab can hold hundreds of thousands of pieces, so index it
	  // only if added_tokens fails to answer a lookup.
	  let vocabIds: Map<string, number> | undefined;
	  const fromVocab = (name: string): number | undefined => {
	    if (vocabIds === undefined) {
	      const built = new Map<string, number>();
	      const raw = obj.model?.vocab;
	      if (Array.isArray(raw)) {
	        raw.forEach(([piece], index) => built.set(piece, index));
	      } else if (raw && typeof raw === 'object') {
	        for (const [piece, id] of Object.entries(raw)) built.set(piece, id);
	      }
	      vocabIds = built;
	    }
	    return vocabIds.get(name);
	  };

	  const resolve = (names: readonly string[], fallback: number): number => {
	    for (const name of names) {
	      const id = added.get(name);
	      if (id !== undefined) return id;
	    }
	    for (const name of names) {
	      const id = fromVocab(name);
	      if (id !== undefined) return id;
	    }
	    return fallback;
	  };

	  return {
	    cls: resolve(['<s>', '[CLS]'], 0),
	    sep: resolve(['</s>', '[SEP]'], 2),
	    pad: resolve(['<pad>', '[PAD]'], 1),
	  };
	}