infon-coref-pointer / js /src /tokenizer.ts
cp500's picture
Upload js/src/tokenizer.ts with huggingface_hub
bd7a5c9 verified
Raw
History Blame Contribute Delete
8.63 kB
/**
* SentencePiece tokenizer wrapper.
*
* The trained CorefPointer uses ``paraphrase-multilingual-MiniLM-L12``,
* which inherits XLM-R's 250k SentencePiece vocab. We need offsets
* (char-start/char-end per wordpiece) to project mention spans back
* onto the source text — that's what makes the BIO output usable.
*
* We use HF's ``tokenizers`` JSON format directly via a
* small JSON-driven implementation here rather than depend on
* ``@huggingface/tokenizers``, which is heavyweight and ships
* different artefacts for browser vs Node. The HF JSON spec is
* stable and the SentencePiece-Unigram path that XLM-R uses is small
* enough to implement well in ~150 lines.
*
* For the alpha we use ``tokenizers``'s ``encode`` via dynamic import
* if it's available, else fall back to a minimal SP tokenizer that
* handles the XLM-R subset. Both paths return identical (id, char,
* end) triples for our test sentences.
*
* NOTE: this file intentionally has no DOM/Node-specific code so the
* tree-shaker can drop unused branches. The only side effects are
* the dynamic imports inside ``loadTokenizer``.
*/
import type { Token } from './types.js';
/**
 * Tokenized output ready for the model.
 *
 * The tokenizers in this file build all three fields from the same
 * token sequence, so index ``i`` refers to the same token in each.
 */
export interface Encoding {
/** Token ids, stored as 64-bit integers. */
inputIds: BigInt64Array;
/** One mask entry per element of ``inputIds``. */
attentionMask: BigInt64Array;
/** Per-token records; the tokenizers here populate ``id``, ``text``,
 * ``start`` and ``end`` on each one. */
tokens: Token[];
}
/**
 * Loaded tokenizer state. ``tokenize`` is the only method
 * downstream code uses.
 */
export interface Tokenizer {
/** Encode ``text`` into model inputs. ``opts.maxLength`` limits how
 * many tokens are produced; both implementations in this file fall
 * back to 256 when it is omitted. */
tokenize(text: string, opts?: { maxLength?: number }): Encoding;
/** Special-token ids. Used by the model to know what to skip when
 * building mention boundaries (CLS/SEP/PAD shouldn't be included
 * in spans). */
specials: { cls: number; sep: number; pad: number };
}
/**
 * Load a tokenizer from a ``tokenizer.json`` source.
 *
 * Accepted inputs:
 *  - a string: fetched with ``fetch`` when running in a browser or when
 *    it is an ``http://`` / ``https://`` URL; otherwise treated as a
 *    Node file path and read through ``node:fs/promises``;
 *  - an ``ArrayBuffer`` or ``Uint8Array`` holding the UTF-8 JSON bytes.
 *
 * The parsed JSON is handed to the ``@huggingface/tokenizers`` backend
 * when that package can be imported and initialised. If the import or
 * the initialisation throws, the minimal built-in tokenizer is used
 * instead (deliberate best-effort fallback).
 *
 * @param src URL, file path, or raw bytes of ``tokenizer.json``.
 * @returns A ready-to-use tokenizer.
 * @throws Error when the HTTP response is not OK. File-read errors,
 *   JSON parse errors and errors raised while building the fallback
 *   tokenizer propagate unchanged.
 */
export async function loadTokenizer(
  src: string | ArrayBuffer | Uint8Array,
): Promise<Tokenizer> {
  let json: unknown;
  if (typeof src === 'string') {
    const isBrowser = typeof window !== 'undefined';
    // Require a real scheme. A bare ``startsWith('http')`` would also
    // capture local paths such as ``httpdocs/tokenizer.json``.
    const isHttpUrl = /^https?:\/\//i.test(src);
    if (isBrowser || isHttpUrl) {
      const r = await fetch(src);
      if (!r.ok) throw new Error(`tokenizer fetch failed: ${r.status}`);
      json = await r.json();
    } else {
      // Node file path.
      const fs = await import('node:fs/promises');
      const buf = await fs.readFile(src, 'utf-8');
      json = JSON.parse(buf);
    }
  } else {
    const decoder = new TextDecoder();
    const buf = src instanceof Uint8Array ? src : new Uint8Array(src);
    json = JSON.parse(decoder.decode(buf));
  }
  // Try the @huggingface/tokenizers path first.
  // Fall back to our minimal implementation if it isn't installed.
  // The module specifier lives in a variable so bundlers don't try to
  // resolve it at build time when the user hasn't installed it.
  try {
    const spec = '@huggingface/tokenizers';
    const hf = await import(/* @vite-ignore */ spec);
    return makeHfTokenizer(hf, json);
  } catch {
    // Missing package or incompatible API: use the built-in tokenizer.
    return makeMinimalTokenizer(json);
  }
}
// ── HF @huggingface/tokenizers backend ───────────────────────────────
/**
 * Wrap an imported ``@huggingface/tokenizers`` module as a ``Tokenizer``.
 *
 * The module is expected to expose ``Tokenizer.fromString`` and the
 * resulting object an ``encode`` returning ``getIds`` /
 * ``getAttentionMask`` / ``getOffsets`` / ``getTokens`` accessors; if it
 * does not, the call throws and the caller is expected to fall back.
 *
 * @param hf The dynamically imported module object.
 * @param json Parsed ``tokenizer.json`` contents.
 * @returns A tokenizer whose ``tokenize`` truncates to ``maxLength``
 *   (default 256) tokens.
 */
function makeHfTokenizer(hf: unknown, json: unknown): Tokenizer {
  const Mod = hf as {
    Tokenizer: { fromString(s: string): { encode(t: string): unknown } };
  };
  const tk = Mod.Tokenizer.fromString(JSON.stringify(json));
  const specials = pickSpecials(json);
  return {
    specials,
    tokenize(text, opts) {
      const max = opts?.maxLength ?? 256;
      const enc = tk.encode(text) as {
        getIds(): number[];
        getAttentionMask(): number[];
        getOffsets(): [number, number][];
        getTokens(): string[];
      };
      const allIds = enc.getIds();
      const allAttn = enc.getAttentionMask();
      const allOffsets = enc.getOffsets();
      const allToks = enc.getTokens();

      // Indices of the tokens that survive truncation.
      const total = allIds.length;
      const keep: number[] = [];
      for (let i = 0; i < total && i < max; i++) keep.push(i);

      // A plain prefix cut would drop the closing separator. When the
      // encoding ends with SEP, spend the last kept slot on it so a
      // truncated sequence is still terminated, matching the minimal
      // backend which reserves room for SEP.
      const last = total - 1;
      if (total > max && max >= 2 && allIds[last] === specials.sep) {
        keep[keep.length - 1] = last;
      }

      const ids = keep.map((i) => allIds[i]);
      const attn = keep.map((i) => allAttn[i]);
      const tokens: Token[] = keep.map((i) => ({
        id: allIds[i],
        text: allToks[i],
        start: allOffsets[i][0],
        end: allOffsets[i][1],
      }));
      return {
        inputIds: BigInt64Array.from(ids.map((x) => BigInt(x))),
        attentionMask: BigInt64Array.from(attn.map((x) => BigInt(x))),
        tokens,
      };
    },
  };
}
// ── Minimal SentencePiece fallback ──────────────────────────────────
/**
 * Minimal XLM-R-compatible SentencePiece tokenizer.
 *
 * Pipeline: NFKC normalization -> spaces replaced by U+2581 (with one
 * prepended when the text does not already start with a space) ->
 * greedy longest-prefix match against the model's pieces. This is an
 * approximation of real Unigram decoding; install
 * ``@huggingface/tokenizers`` for accuracy-critical use.
 *
 * Offsets are reported against the *original* string whenever the
 * normalized text can be aligned with it chunk by chunk (one code
 * point plus trailing combining marks). If that alignment cannot be
 * verified, offsets fall back to positions in the normalized text.
 * A token that begins with U+2581 covers the space it stands for; the
 * synthetic leading U+2581 has zero width.
 *
 * Piece ids are the positions of the pieces in ``model.vocab``.
 *
 * @param json Parsed ``tokenizer.json`` contents.
 * @returns A tokenizer producing ``<s> ... </s>`` framed encodings.
 * @throws Error when the model type is not ``Unigram`` or when
 *   ``model.vocab`` is not an array.
 */
function makeMinimalTokenizer(json: unknown): Tokenizer {
  const model = (
    json as
      | {
          model?: {
            type?: string;
            vocab?: [string, number][];
            unk_id?: number;
          };
        }
      | null
      | undefined
  )?.model;
  if (!model || model.type !== 'Unigram') {
    throw new Error(
      `minimal tokenizer only supports Unigram; got ${model?.type}. ` +
        'Install @huggingface/tokenizers for full support.',
    );
  }
  const pieces = model.vocab;
  if (!Array.isArray(pieces)) {
    throw new Error(
      'minimal tokenizer: model.vocab must be an array of [piece, score] pairs.',
    );
  }

  // id == position in the vocab list. Also track the longest piece so
  // the greedy matcher never has to guess an upper bound.
  const vocab = new Map<string, number>();
  let maxPieceLen = 1;
  for (let i = 0; i < pieces.length; i++) {
    const piece = pieces[i][0];
    vocab.set(piece, i);
    if (piece.length > maxPieceLen) maxPieceLen = piece.length;
  }
  const unk = model.unk_id ?? vocab.get('<unk>') ?? 0;
  const specials = pickSpecials(json);
  const SPACE = '\u2581'; // LOWER ONE EIGHTH BLOCK, the SentencePiece space marker
  // One code point followed by any combining marks.
  const CHUNK_RE = /[\s\S]\p{M}*/gu;

  /**
   * Map every UTF-16 unit of ``norm`` to the [start, end) range of the
   * original chunk it came from. Returns ``undefined`` when the two
   * strings are identical (identity map) or when chunk-wise
   * normalization does not reproduce ``norm``.
   */
  function alignToOriginal(
    text: string,
    norm: string,
  ): { starts: number[]; ends: number[] } | undefined {
    if (norm === text) return undefined;
    const starts: number[] = [];
    const ends: number[] = [];
    let rebuilt = '';
    for (const m of text.matchAll(CHUNK_RE)) {
      const from = m.index ?? 0;
      const to = from + m[0].length;
      const chunk = m[0].normalize('NFKC');
      for (let k = 0; k < chunk.length; k++) {
        starts.push(from);
        ends.push(to);
      }
      rebuilt += chunk;
    }
    return rebuilt === norm ? { starts, ends } : undefined;
  }

  function encode(text: string, max: number): Encoding {
    const norm = text.normalize('NFKC');
    const spans = alignToOriginal(text, norm);
    const body = norm.replace(/ /g, SPACE);
    // Prefix a synthetic marker unless the text already starts with one
    // (or is empty). ``shift`` converts piece indices to ``norm`` indices.
    const shift = body.length > 0 && !body.startsWith(SPACE) ? 1 : 0;
    const piece = shift === 1 ? SPACE + body : body;

    const toOrigStart = (k: number): number =>
      spans === undefined ? k : (spans.starts[k] ?? text.length);
    const toOrigEnd = (k: number): number =>
      spans === undefined ? k : (spans.ends[k - 1] ?? text.length);

    const ids: number[] = [specials.cls];
    const tokens: Token[] = [
      { id: specials.cls, text: '<s>', start: 0, end: 0 },
    ];
    let p = 0;
    let lastEnd = 0;
    // ``max - 1`` leaves room for the closing separator.
    while (p < piece.length && ids.length < max - 1) {
      let matchLen = 0;
      let matchId = unk;
      const longest = Math.min(piece.length - p, maxPieceLen);
      for (let len = longest; len >= 1; len--) {
        const id = vocab.get(piece.substring(p, p + len));
        if (id !== undefined) {
          matchLen = len;
          matchId = id;
          break;
        }
      }
      if (matchLen === 0) {
        // Unknown: consume one whole code point so surrogate pairs are
        // never split across two tokens.
        const cp = piece.codePointAt(p);
        matchLen = cp !== undefined && cp > 0xffff ? 2 : 1;
      }
      const matchText = piece.substring(p, p + matchLen);
      const normStart = Math.max(0, p - shift);
      const normEnd = p + matchLen - shift;
      // An empty range only happens for the synthetic prefix on its own.
      const covers = normEnd > normStart;
      const start = covers ? toOrigStart(normStart) : lastEnd;
      const end = covers ? toOrigEnd(normEnd) : lastEnd;
      tokens.push({ id: matchId, text: matchText, start, end });
      ids.push(matchId);
      p += matchLen;
      lastEnd = end;
    }
    ids.push(specials.sep);
    tokens.push({
      id: specials.sep,
      text: '</s>',
      start: lastEnd,
      end: lastEnd,
    });
    return {
      inputIds: BigInt64Array.from(ids.map((x) => BigInt(x))),
      attentionMask: new BigInt64Array(ids.length).fill(1n),
      tokens,
    };
  }

  return {
    specials,
    tokenize(text, opts) {
      return encode(text, opts?.maxLength ?? 256);
    },
  };
}
/**
 * Resolve the CLS / SEP / PAD token ids from a parsed ``tokenizer.json``.
 *
 * XLM-R uses ``<s>`` / ``</s>`` / ``<pad>``; some vocabs use ``[CLS]`` /
 * ``[SEP]`` / ``[PAD]``. For each special token, ``added_tokens`` is
 * consulted first (authoritative), then ``model.vocab``, and finally the
 * XLM-R defaults (0 / 2 / 1). The vocab may be a list of
 * ``[piece, score]`` pairs (id = position) or a ``piece -> id`` object.
 *
 * @param json Parsed ``tokenizer.json`` contents (any shape tolerated).
 * @returns The resolved special-token ids.
 */
function pickSpecials(json: unknown): {
  cls: number;
  sep: number;
  pad: number;
} {
  const obj = (json ?? {}) as {
    added_tokens?: { id: number; content: string }[];
    model?: { vocab?: [string, number][] | Record<string, number> };
  };

  const added = new Map<string, number>();
  if (Array.isArray(obj.added_tokens)) {
    for (const t of obj.added_tokens) added.set(t.content, t.id);
  }

  // The vocab index is only built if some special token is missing
  // from ``added_tokens``.
  let vocabIds: Map<string, number> | undefined;
  const vocabIndex = (): Map<string, number> => {
    if (vocabIds !== undefined) return vocabIds;
    const built = new Map<string, number>();
    const vocab = obj.model?.vocab;
    if (Array.isArray(vocab)) {
      vocab.forEach((entry, index) => {
        built.set(entry[0], index);
      });
    } else if (vocab !== null && typeof vocab === 'object') {
      for (const [piece, id] of Object.entries(vocab)) built.set(piece, id);
    }
    vocabIds = built;
    return built;
  };

  const resolve = (names: readonly string[], fallback: number): number => {
    for (const name of names) {
      const id = added.get(name);
      if (id !== undefined && id !== null) return id;
    }
    const vocab = vocabIndex();
    for (const name of names) {
      const id = vocab.get(name);
      if (id !== undefined && id !== null) return id;
    }
    return fallback;
  };

  return {
    cls: resolve(['<s>', '[CLS]'], 0),
    sep: resolve(['</s>', '[SEP]'], 2),
    pad: resolve(['<pad>', '[PAD]'], 1),
  };
}