Token Classification
Transformers.js
ONNX
bert
feature-extraction
coreference
multilingual
onnxruntime-web
Instructions to use cp500/infon-coref-pointer with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers.js
How to use cp500/infon-coref-pointer with Transformers.js:
// npm i @huggingface/transformers
import { pipeline } from '@huggingface/transformers';

// Allocate pipeline
const pipe = await pipeline('token-classification', 'cp500/infon-coref-pointer');
| /** | |
| * SentencePiece tokenizer wrapper. | |
| * | |
| * The trained CorefPointer uses ``paraphrase-multilingual-MiniLM-L12``, | |
| * which inherits XLM-R's 250k SentencePiece vocab. We need offsets | |
| * (char-start/char-end per wordpiece) to project mention spans back | |
| * onto the source text — that's what makes the BIO output usable. | |
| * | |
| * We use HF's ``tokenizers`` JSON format directly via a | |
| * small JSON-driven implementation here rather than depend on | |
| * ``@huggingface/tokenizers``, which is heavyweight and ships | |
| * different artefacts for browser vs Node. The HF JSON spec is | |
| * stable and the SentencePiece Unigram path that XLM-R uses is small | |
| * enough to implement well in ~150 lines. | |
| * | |
| * For the alpha we use ``tokenizers``'s ``encode`` via dynamic import | |
| * if it's available, else fall back to a minimal SP tokenizer that | |
| * handles the XLM-R subset. Both paths return identical (id, char, | |
| * end) triples for our test sentences. | |
| * | |
| * NOTE: this file intentionally has no DOM/Node-specific code so the | |
| * tree-shaker can drop unused branches. The only side effects are | |
| * the dynamic imports inside ``loadFrom*``. | |
| */ | |
| import type { Token } from './types.js'; | |
/**
 * Result of tokenizing a single string, in the shape the model consumes.
 *
 * The three members are parallel: entry `i` of each describes the same
 * token.
 */
export interface Encoding {
  /** Token ids, widened to 64-bit integers. */
  inputIds: BigInt64Array;
  /** Attention mask, one 64-bit entry per token. */
  attentionMask: BigInt64Array;
  /** Per-token metadata (id, surface text, character span). */
  tokens: Token[];
}
/**
 * A tokenizer that has been loaded and is ready to use.
 *
 * Downstream code only ever calls {@link Tokenizer.tokenize}.
 */
export interface Tokenizer {
  /**
   * Encode `text` for the model.
   *
   * @param text - Raw input string.
   * @param opts - Optional settings; `maxLength` caps the number of tokens
   *   produced (the implementations in this file default it to 256).
   * @returns Ids, attention mask and per-token spans.
   */
  tokenize(text: string, opts?: { maxLength?: number }): Encoding;

  /**
   * Ids of the special tokens. The model uses them to decide what to
   * skip while building mention boundaries: CLS, SEP and PAD must never
   * end up inside a span.
   */
  specials: { cls: number; sep: number; pad: number };
}
/**
 * Load a tokenizer from a ``tokenizer.json`` URL, file path, or buffer.
 *
 * - String source: fetched with ``fetch`` unless we are running under Node
 *   without a ``window`` global and the string is not an ``http(s)://`` URL,
 *   in which case it is read from disk as a UTF-8 file path.
 * - ``ArrayBuffer`` / ``Uint8Array`` source: decoded as UTF-8 JSON directly.
 *
 * After the JSON is parsed, the ``@huggingface/tokenizers`` backend is tried
 * first; if importing or constructing it fails for any reason, the minimal
 * built-in tokenizer is used instead.
 *
 * @param src - URL, Node file path, or the raw bytes of ``tokenizer.json``.
 * @returns A ready-to-use {@link Tokenizer}.
 * @throws Error if the HTTP fetch returns a non-OK status, if the JSON is
 *   malformed, or if the fallback tokenizer rejects the model type.
 */
export async function loadTokenizer(
  src: string | ArrayBuffer | Uint8Array,
): Promise<Tokenizer> {
  let json: unknown;
  if (typeof src === 'string') {
    const hasWindow = typeof window !== 'undefined';
    // Probe for Node explicitly. Checking only for `window` misclassifies
    // Web Workers (no `window`, but also no `node:fs`) as Node.
    const proc = (
      globalThis as { process?: { versions?: { node?: string } } }
    ).process;
    const isNode = typeof proc?.versions?.node === 'string';
    // Require a real scheme so a local path like "httpd/tokenizer.json"
    // is still treated as a file.
    const isHttpUrl = /^https?:\/\//i.test(src);
    if (hasWindow || !isNode || isHttpUrl) {
      const r = await fetch(src);
      if (!r.ok) throw new Error(`tokenizer fetch failed: ${r.status}`);
      json = await r.json();
    } else {
      // Node file path.
      const fs = await import('node:fs/promises');
      const buf = await fs.readFile(src, 'utf-8');
      json = JSON.parse(buf);
    }
  } else {
    const decoder = new TextDecoder();
    const buf = src instanceof Uint8Array ? src : new Uint8Array(src);
    json = JSON.parse(decoder.decode(buf));
  }
  // Try the @huggingface/tokenizers path first (fast, native WASM).
  // Fall back to our minimal implementation if it isn't installed.
  // The specifier is held in a variable so bundlers don't try to resolve
  // it at build time when the user hasn't installed it.
  try {
    const spec = '@huggingface/tokenizers';
    const hf = await import(/* @vite-ignore */ spec);
    return makeHfTokenizer(hf, json);
  } catch {
    // Deliberate best-effort: any failure in the optional backend (missing
    // package or incompatible API) degrades to the built-in tokenizer.
    return makeMinimalTokenizer(json);
  }
}
| // ── HF @huggingface/tokenizers backend ─────────────────────────────── | |
/**
 * Wrap a dynamically imported tokenizers module as a {@link Tokenizer}.
 *
 * NOTE(review): this assumes the module exposes ``Tokenizer.fromString`` and
 * that ``encode`` returns an object with ``getIds`` / ``getAttentionMask`` /
 * ``getOffsets`` / ``getTokens`` — confirm against the installed package
 * version. If the shape differs, the call throws and the caller falls back
 * to the minimal tokenizer.
 *
 * @param hf - The imported module namespace.
 * @param json - Parsed ``tokenizer.json`` contents.
 * @returns A tokenizer whose output is truncated to ``maxLength``
 *   (default 256) tokens.
 */
function makeHfTokenizer(hf: unknown, json: unknown): Tokenizer {
  const Mod = hf as {
    Tokenizer: { fromString(s: string): { encode(t: string): unknown } };
  };
  const tk = Mod.Tokenizer.fromString(JSON.stringify(json));
  const specials = pickSpecials(json);
  return {
    specials,
    tokenize(text, opts) {
      const max = opts?.maxLength ?? 256;
      const enc = tk.encode(text) as {
        getIds(): number[];
        getAttentionMask(): number[];
        getOffsets(): [number, number][];
        getTokens(): string[];
      };
      const allIds = enc.getIds();
      // A plain slice would cut off the closing SEP on over-long input.
      // When the encoding ends in SEP, keep it as the last token and drop
      // content tokens instead.
      const keepSep =
        allIds.length > max &&
        max >= 2 &&
        allIds[allIds.length - 1] === specials.sep;
      const clip = <T>(xs: T[]): T[] =>
        keepSep
          ? [...xs.slice(0, max - 1), xs[xs.length - 1] as T]
          : xs.slice(0, max);

      const ids = clip(allIds);
      const attn = clip(enc.getAttentionMask());
      const offsets = clip(enc.getOffsets());
      const toks = clip(enc.getTokens());
      const tokens: Token[] = ids.map((id, i) => ({
        id,
        text: toks[i] ?? '',
        start: offsets[i]?.[0] ?? 0,
        end: offsets[i]?.[1] ?? 0,
      }));
      return {
        inputIds: BigInt64Array.from(ids.map((x) => BigInt(x))),
        attentionMask: BigInt64Array.from(attn.map((x) => BigInt(x))),
        tokens,
      };
    },
  };
}
| // ── Minimal SentencePiece fallback ────────────────────────────────── | |
/**
 * NFKC-normalize `text` while remembering, for every UTF-16 unit of the
 * normalized string, which span of the original string produced it.
 *
 * The text is normalized one cluster (a code point plus trailing combining
 * marks) at a time so the mapping is exact. If the cluster-wise result ever
 * differs from whole-string NFKC, the whole-string result is returned with
 * a clamped identity mapping, so tokenization never depends on this helper.
 */
function nfkcWithSourceOffsets(text: string): {
  norm: string;
  starts: number[];
  ends: number[];
} {
  const whole = text.normalize('NFKC');
  const starts: number[] = [];
  const ends: number[] = [];
  let rebuilt = '';
  for (const m of text.matchAll(/\P{M}\p{M}*|\p{M}+/gu)) {
    const from = m.index ?? 0;
    const to = from + m[0].length;
    const part = m[0].normalize('NFKC');
    rebuilt += part;
    for (let k = 0; k < part.length; k++) {
      starts.push(from);
      ends.push(to);
    }
  }
  if (rebuilt === whole) return { norm: whole, starts, ends };
  // Fallback: positions in the normalized string, clamped to the source.
  const limit = text.length;
  return {
    norm: whole,
    starts: Array.from({ length: whole.length }, (_, i) => Math.min(i, limit)),
    ends: Array.from({ length: whole.length }, (_, i) =>
      Math.min(i + 1, limit),
    ),
  };
}

/**
 * Minimal XLM-R-compatible SentencePiece tokenizer.
 *
 * Implements just enough to handle the multilingual MiniLM vocabulary:
 * NFKC normalization → replacing spaces with the U+2581 word marker (with
 * one marker prepended) → greedy longest-prefix matching against the
 * model's pieces. Piece ids are the positions of the pieces in
 * ``model.vocab``. Character offsets refer to the *original*
 * (un-normalized) string so mention spans land on real source characters.
 *
 * This is not a full HF Tokenizers reimplementation: real Unigram decoding
 * picks the best-scoring segmentation, whereas this approximates it with
 * greedy matching. Install ``@huggingface/tokenizers`` for
 * accuracy-critical use; the loader prefers it when available.
 *
 * @param json - Parsed ``tokenizer.json`` contents.
 * @returns A tokenizer that wraps its output in CLS … SEP and limits it to
 *   ``maxLength`` (default 256) tokens.
 * @throws Error if ``model.type`` is not ``'Unigram'``.
 */
function makeMinimalTokenizer(json: unknown): Tokenizer {
  const obj = json as {
    model: {
      type: string;
      vocab: [string, number][];
      unk_id?: number;
    };
  };
  if (obj.model.type !== 'Unigram') {
    throw new Error(
      `minimal tokenizer only supports Unigram; got ${obj.model.type}. ` +
        'Install @huggingface/tokenizers for full support.',
    );
  }

  const vocab = new Map<string, number>();
  let maxPieceLen = 1;
  obj.model.vocab.forEach(([piece], index) => {
    // The array position is the id; using it directly keeps ids correct
    // even if a piece is listed twice.
    vocab.set(piece, index);
    if (piece.length > maxPieceLen) maxPieceLen = piece.length;
  });
  const unk = obj.model.unk_id ?? vocab.get('<unk>') ?? 0;
  const specials = pickSpecials(json);
  const SPACE = '\u2581'; // "▁", the SentencePiece word-boundary marker

  function encode(text: string, max: number): Encoding {
    const { norm, starts, ends } = nfkcWithSourceOffsets(text);
    // The space → marker replacement preserves length, so piece[i]
    // corresponds to norm[i - 1]; piece[0] is the synthetic leading marker
    // and maps to no source character.
    const piece = norm.length > 0 ? SPACE + norm.replace(/ /g, SPACE) : '';

    const ids: number[] = [specials.cls];
    const tokens: Token[] = [
      { id: specials.cls, text: '<s>', start: 0, end: 0 },
    ];

    let p = 0;
    let lastEnd = 0;
    // Stop one short of `max` so the closing SEP always fits.
    while (p < piece.length && ids.length < max - 1) {
      let matchLen = 0;
      let matchId = unk;
      for (
        let len = Math.min(piece.length - p, maxPieceLen);
        len >= 1;
        len--
      ) {
        const id = vocab.get(piece.substring(p, p + len));
        if (id !== undefined) {
          matchLen = len;
          matchId = id;
          break;
        }
      }
      if (matchLen === 0) {
        if (p === 0) {
          // No piece starts with the synthetic marker: drop the marker
          // instead of emitting an <unk> for a character we invented.
          p = 1;
          continue;
        }
        // Unknown character: consume one whole code point so surrogate
        // pairs are not split into two tokens.
        const cp = piece.codePointAt(p) ?? 0;
        matchLen = cp > 0xffff ? 2 : 1;
      }

      const q = p + matchLen;
      const a = Math.max(p - 1, 0); // first covered index in `norm`
      const b = q - 1; // one past the last covered index in `norm`
      const covers = b > a;
      const start = covers ? (starts[a] ?? lastEnd) : lastEnd;
      const end = covers ? (ends[b - 1] ?? start) : lastEnd;

      tokens.push({ id: matchId, text: piece.substring(p, q), start, end });
      ids.push(matchId);
      p = q;
      lastEnd = end;
    }

    ids.push(specials.sep);
    tokens.push({
      id: specials.sep,
      text: '</s>',
      start: lastEnd,
      end: lastEnd,
    });

    return {
      inputIds: BigInt64Array.from(ids.map((x) => BigInt(x))),
      attentionMask: new BigInt64Array(ids.length).fill(1n),
      tokens,
    };
  }

  return {
    specials,
    tokenize(text, opts) {
      return encode(text, opts?.maxLength ?? 256);
    },
  };
}
/**
 * Resolve the ids of the CLS, SEP and PAD special tokens from a parsed
 * ``tokenizer.json``.
 *
 * XLM-R style files name them ``<s>`` / ``</s>`` / ``<pad>``; some vocabs
 * use ``[CLS]`` / ``[SEP]`` / ``[PAD]``. Each token is looked up in
 * ``added_tokens`` first (authoritative) and then in ``model.vocab``,
 * which may be either an array of ``[piece, score]`` pairs (id = array
 * position) or an object mapping piece → id.
 *
 * @param json - Parsed ``tokenizer.json`` contents.
 * @returns The three ids; a token found nowhere falls back to the defaults
 *   cls = 0, sep = 2, pad = 1.
 */
function pickSpecials(json: unknown): {
  cls: number;
  sep: number;
  pad: number;
} {
  const obj = json as {
    added_tokens?: { id: number; content: string }[];
    model?: { vocab?: [string, number][] | Record<string, number> };
  };

  const added = new Map<string, number>();
  for (const t of obj.added_tokens ?? []) added.set(t.content, t.id);

  // Built lazily: the vocab can be large and is usually not needed.
  let vocabIds: Map<string, number> | undefined;
  const fromVocab = (name: string): number | undefined => {
    if (vocabIds === undefined) {
      vocabIds = new Map<string, number>();
      const vocab = obj.model?.vocab;
      if (Array.isArray(vocab)) {
        vocab.forEach(([piece], index) => vocabIds?.set(piece, index));
      } else if (vocab !== undefined && vocab !== null) {
        for (const [piece, id] of Object.entries(vocab)) {
          vocabIds.set(piece, id);
        }
      }
    }
    return vocabIds.get(name);
  };

  const resolve = (primary: string, alt: string, fallback: number): number =>
    added.get(primary) ??
    added.get(alt) ??
    fromVocab(primary) ??
    fromVocab(alt) ??
    fallback;

  return {
    cls: resolve('<s>', '[CLS]', 0),
    sep: resolve('</s>', '[SEP]', 2),
    pad: resolve('<pad>', '[PAD]', 1),
  };
}