Token Classification
Transformers.js
ONNX
bert
feature-extraction
coreference
multilingual
onnxruntime-web
Instructions to use cp500/infon-coref-pointer with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers.js
How to use cp500/infon-coref-pointer with Transformers.js:
// npm i @huggingface/transformers
import { pipeline } from '@huggingface/transformers';

// Allocate pipeline
const pipe = await pipeline('token-classification', 'cp500/infon-coref-pointer');
File size: 8,632 Bytes
/**
* SentencePiece tokenizer wrapper.
*
* The trained CorefPointer uses ``paraphrase-multilingual-MiniLM-L12``,
* which inherits XLM-R's 250k SentencePiece vocab. We need offsets
* (char-start/char-end per wordpiece) to project mention spans back
* onto the source text — that's what makes the BIO output usable.
*
* We use HF's ``tokenizers`` JSON format directly via a
* small JSON-driven implementation here rather than depend on
* ``@huggingface/tokenizers``, which is heavyweight and ships
* different artefacts for browser vs Node. The HF JSON spec is
* stable and the SentencePiece Unigram path that XLM-R uses is small
* enough to implement well in ~150 lines.
*
* For the alpha we use ``@huggingface/tokenizers``'s ``encode`` via
* dynamic import if it's available, else fall back to a minimal SP
* tokenizer that handles the XLM-R subset. Both paths return identical
* (id, start, end) triples for our test sentences.
*
* NOTE: this file intentionally has no top-level DOM/Node-specific code
* so the tree-shaker can drop unused branches. The only side effects
* are the dynamic imports inside ``loadTokenizer``.
*/
import type { Token } from './types.js';
/**
* Tokenized output ready for the model.
*
* Both tokenizer backends in this file build ``inputIds`` and ``tokens``
* from the same id list, so entry ``i`` of each describes the same token.
*/
export interface Encoding {
/** Token ids, converted to 64-bit integers with ``BigInt``. */
inputIds: BigInt64Array;
/** Attention mask as 64-bit integers; presumably 1 marks a real token —
* confirm against the model's expected inputs. */
attentionMask: BigInt64Array;
/** Per-token records; the backends in this file populate ``id``,
* ``text``, ``start`` and ``end`` on each one. */
tokens: Token[];
}
/**
* Loaded tokenizer state. ``tokenize`` is the only method downstream
* code uses.
*/
export interface Tokenizer {
/**
* Tokenize ``text`` into model inputs.
*
* ``opts.maxLength`` is the requested maximum sequence length; both
* implementations in this file default it to 256 when omitted.
*/
tokenize(text: string, opts?: { maxLength?: number }): Encoding;
/** Special-token ids. Used by the model to know what to skip when
* building mention boundaries (CLS/SEP/PAD shouldn't be included
* in spans). */
specials: { cls: number; sep: number; pad: number };
}
/**
 * Load a tokenizer from ``tokenizer.json`` content.
 *
 * @param src - Where the JSON comes from:
 *   - a string: fetched with ``fetch`` when running in a browser page or a
 *     web worker, or when the string starts with ``http``; otherwise it is
 *     read as a file path through ``node:fs/promises``;
 *   - an ``ArrayBuffer`` or ``Uint8Array``: decoded as UTF-8 JSON text.
 * @returns A tokenizer backed by ``@huggingface/tokenizers`` when that
 *   package can be imported and wrapped, otherwise the minimal built-in one.
 * @throws Error when the fetch response is not OK. File-read and JSON
 *   parse errors propagate unchanged.
 */
export async function loadTokenizer(
  src: string | ArrayBuffer | Uint8Array,
): Promise<Tokenizer> {
  let json: unknown;
  if (typeof src === 'string') {
    // Web workers have no `window`, so a worker global scope (which exposes
    // `importScripts`) also counts as a browser. Without this a relative URL
    // passed from a worker would be routed to the Node file-system branch.
    const isBrowser =
      typeof window !== 'undefined' || 'importScripts' in globalThis;
    if (isBrowser || src.startsWith('http')) {
      const r = await fetch(src);
      if (!r.ok) throw new Error(`tokenizer fetch failed: ${r.status}`);
      json = await r.json();
    } else {
      // Node file path. Imported lazily so browser bundles never touch it.
      const fs = await import('node:fs/promises');
      const buf = await fs.readFile(src, 'utf-8');
      json = JSON.parse(buf);
    }
  } else {
    // Raw bytes supplied by the caller.
    const bytes = src instanceof Uint8Array ? src : new Uint8Array(src);
    json = JSON.parse(new TextDecoder().decode(bytes));
  }

  // Try the @huggingface/tokenizers path first and fall back to the minimal
  // implementation if the package is missing or cannot be wrapped (the
  // catch deliberately covers both the import and the construction).
  // The specifier is held in a variable so bundlers don't try to resolve it
  // at build time when the user hasn't installed it.
  try {
    const spec = '@huggingface/tokenizers';
    const hf = await import(/* @vite-ignore */ spec);
    return makeHfTokenizer(hf, json);
  } catch {
    return makeMinimalTokenizer(json);
  }
}
// ── HF @huggingface/tokenizers backend ───────────────────────────────
/**
 * Wrap an imported tokenizers module as a {@link Tokenizer}.
 *
 * @param hf - The dynamically imported module. It is expected to expose
 *   ``Tokenizer.fromString`` whose result has a synchronous ``encode``
 *   returning an object with ``getIds``/``getAttentionMask``/``getOffsets``/
 *   ``getTokens``; a module with a different shape makes this function (or
 *   the returned ``tokenize``) throw.
 * @param json - Parsed ``tokenizer.json`` content.
 * @returns A tokenizer whose ``tokenize`` truncates to ``maxLength``
 *   (default 256).
 */
function makeHfTokenizer(hf: unknown, json: unknown): Tokenizer {
  const mod = hf as {
    Tokenizer: { fromString(s: string): { encode(t: string): unknown } };
  };
  const tk = mod.Tokenizer.fromString(JSON.stringify(json));
  const specials = pickSpecials(json);
  return {
    specials,
    tokenize(text, opts) {
      const max = opts?.maxLength ?? 256;
      const enc = tk.encode(text) as {
        getIds(): number[];
        getAttentionMask(): number[];
        getOffsets(): [number, number][];
        getTokens(): string[];
      };
      const allIds = enc.getIds();

      // A plain slice would cut off the closing separator of an over-long
      // sequence. When the encoding ends with SEP, keep `max - 1` leading
      // entries plus that final entry, so truncated output is still
      // terminated (the minimal fallback tokenizer reserves the same slot).
      const keepSep =
        max >= 2 &&
        allIds.length > max &&
        allIds[allIds.length - 1] === specials.sep;
      const clip = <T>(xs: T[]): T[] =>
        keepSep ? [...xs.slice(0, max - 1), ...xs.slice(-1)] : xs.slice(0, max);

      const ids = clip(allIds);
      const attn = clip(enc.getAttentionMask());
      const offsets = clip(enc.getOffsets());
      const toks = clip(enc.getTokens());

      const tokens: Token[] = ids.map((id, i) => ({
        id,
        text: toks[i],
        start: offsets[i][0],
        end: offsets[i][1],
      }));
      return {
        inputIds: BigInt64Array.from(ids.map((x) => BigInt(x))),
        attentionMask: BigInt64Array.from(attn.map((x) => BigInt(x))),
        tokens,
      };
    },
  };
}
// ── Minimal SentencePiece fallback ──────────────────────────────────
/**
 * NFKC-normalize ``text`` and record, for every UTF-16 unit of the
 * normalized string, the source range it came from.
 *
 * The source is split into clusters (one non-mark code point plus any
 * following combining marks) and each cluster is normalized on its own.
 * Every unit produced by a cluster maps to that whole cluster, so a
 * one-to-many expansion such as a ligature points each output unit at the
 * single source character.
 *
 * If the cluster-wise result differs from normalizing the whole string
 * (composition across clusters), no reliable alignment exists and the
 * returned ranges are positions in the normalized string instead.
 */
function alignNfkc(text: string): {
  norm: string;
  starts: number[];
  ends: number[];
} {
  const norm = text.normalize('NFKC');
  const starts: number[] = [];
  const ends: number[] = [];
  let rebuilt = '';
  let pos = 0;
  for (const cluster of text.match(/\P{M}\p{M}*|\p{M}+/gu) ?? []) {
    const normalized = cluster.normalize('NFKC');
    for (let k = 0; k < normalized.length; k++) {
      starts.push(pos);
      ends.push(pos + cluster.length);
    }
    rebuilt += normalized;
    pos += cluster.length;
  }
  if (rebuilt === norm) return { norm, starts, ends };
  return {
    norm,
    starts: Array.from({ length: norm.length }, (_, i) => i),
    ends: Array.from({ length: norm.length }, (_, i) => i + 1),
  };
}

/**
 * Minimal XLM-R-compatible SentencePiece tokenizer.
 *
 * Implements just enough to handle the multilingual MiniLM vocabulary:
 * NFKC normalization → ' ' replaced by '▁' (with a '▁' prefix when the
 * text does not already start with a space) → greedy longest-prefix
 * matching against the model's Unigram pieces. Piece ids are their
 * positions in ``model.vocab``.
 *
 * Offsets are UTF-16 positions in the *original* string whenever the
 * normalization can be aligned cluster by cluster; otherwise they are
 * positions in the normalized string. A piece that starts with '▁'
 * includes the space it replaced in its span; the synthetic prefix has
 * zero width.
 *
 * This isn't a full HF Tokenizers reimplementation: real Unigram decoding
 * picks the best-scoring segmentation, while this uses greedy matching.
 * If ``@huggingface/tokenizers`` is installed it is always preferred.
 *
 * @param json - Parsed ``tokenizer.json`` content with a Unigram model.
 * @throws Error when ``model.type`` is not ``Unigram``.
 */
function makeMinimalTokenizer(json: unknown): Tokenizer {
  const obj = json as {
    model: {
      type: string;
      vocab: [string, number][];
      unk_id?: number;
    };
  };
  if (obj.model.type !== 'Unigram') {
    throw new Error(
      `minimal tokenizer only supports Unigram; got ${obj.model.type}. ` +
        'Install @huggingface/tokenizers for full support.',
    );
  }

  // A piece's id is its index in the vocab list. Using the running map
  // size instead would shift every later id after a duplicate piece.
  const vocab = new Map<string, number>();
  let maxPieceLen = 1;
  obj.model.vocab.forEach(([piece], index) => {
    vocab.set(piece, index);
    if (piece.length > maxPieceLen) maxPieceLen = piece.length;
  });

  const unk = obj.model.unk_id ?? vocab.get('<unk>') ?? 0;
  const specials = pickSpecials(json);
  const SPACE = '\u2581'; // ▁ (LOWER ONE EIGHTH BLOCK), SentencePiece's space marker

  function encode(text: string, max: number): Encoding {
    const { norm, starts, ends } = alignNfkc(text);
    // Replacing ' ' with SPACE is unit-for-unit, so indices into `body`
    // are also indices into `norm`.
    const body = norm.replace(/ /g, SPACE);
    // Word-initial pieces carry the SPACE marker, so the first word needs
    // one too unless the text already begins with a space.
    const prefixed = body.length > 0 && !body.startsWith(SPACE);
    const piece = prefixed ? SPACE + body : body;
    const shift = prefixed ? 1 : 0; // piece index minus norm index

    const ids: number[] = [specials.cls];
    const tokens: Token[] = [
      { id: specials.cls, text: '<s>', start: 0, end: 0 },
    ];

    let p = 0;
    let lastEnd = 0;
    // Stop one short of `max` so the closing separator always fits.
    while (p < piece.length && ids.length < max - 1) {
      let len = 0;
      let id = unk;
      const longest = Math.min(piece.length - p, maxPieceLen);
      for (let l = longest; l >= 1; l--) {
        const hit = vocab.get(piece.substring(p, p + l));
        if (hit !== undefined) {
          len = l;
          id = hit;
          break;
        }
      }
      if (len === 0) {
        // Unknown character: consume one whole code point so a surrogate
        // pair is never split across two tokens.
        const cp = piece.codePointAt(p);
        len = cp !== undefined && cp > 0xffff ? 2 : 1;
      }

      // Project the matched piece range onto the normalized text, then
      // onto the source text.
      const normStart = Math.max(p - shift, 0);
      const normEnd = p + len - shift;
      let start = lastEnd;
      let end = lastEnd;
      if (normEnd > normStart) {
        start = starts[normStart] ?? lastEnd;
        end = ends[normEnd - 1] ?? start;
      }

      tokens.push({ id, text: piece.substring(p, p + len), start, end });
      ids.push(id);
      p += len;
      lastEnd = end;
    }

    ids.push(specials.sep);
    tokens.push({
      id: specials.sep,
      text: '</s>',
      start: lastEnd,
      end: lastEnd,
    });
    return {
      inputIds: BigInt64Array.from(ids.map((x) => BigInt(x))),
      attentionMask: new BigInt64Array(ids.length).fill(1n),
      tokens,
    };
  }

  return {
    specials,
    tokenize(text, opts) {
      return encode(text, opts?.maxLength ?? 256);
    },
  };
}
/**
 * Resolve the CLS/SEP/PAD token ids from parsed ``tokenizer.json`` content.
 *
 * XLM-R style vocabularies name these ``<s>``/``</s>``/``<pad>``; others
 * use ``[CLS]``/``[SEP]``/``[PAD]``. For each role the lookup order is:
 * ``added_tokens`` (authoritative), then ``model.vocab``, then the
 * defaults 0 (cls), 2 (sep) and 1 (pad).
 *
 * ``model.vocab`` may be a list of ``[piece, score]`` pairs, where the id
 * is the list index, or an object mapping piece to id.
 *
 * @param json - Parsed ``tokenizer.json`` content.
 * @returns The ids for the three special tokens.
 */
function pickSpecials(json: unknown): {
  cls: number;
  sep: number;
  pad: number;
} {
  const obj = (json ?? {}) as {
    added_tokens?: { id: number; content: string }[];
    model?: { vocab?: unknown };
  };

  const added = new Map<string, number>();
  for (const t of obj.added_tokens ?? []) added.set(t.content, t.id);

  // The vocab can hold hundreds of thousands of entries, so it is scanned
  // at most once, only when a name is missing from added_tokens, and only
  // the names of interest are recorded.
  const wanted = new Set(['<s>', '</s>', '<pad>', '[CLS]', '[SEP]', '[PAD]']);
  let vocabIds: Map<string, number> | undefined;
  const fromVocab = (name: string): number | undefined => {
    if (vocabIds === undefined) {
      const found = new Map<string, number>();
      const vocab = obj.model?.vocab;
      if (Array.isArray(vocab)) {
        vocab.forEach((entry: unknown, index: number) => {
          const piece = Array.isArray(entry) ? entry[0] : undefined;
          if (typeof piece === 'string' && wanted.has(piece)) {
            found.set(piece, index);
          }
        });
      } else if (typeof vocab === 'object' && vocab !== null) {
        for (const [piece, id] of Object.entries(vocab)) {
          if (wanted.has(piece) && typeof id === 'number') found.set(piece, id);
        }
      }
      vocabIds = found;
    }
    return vocabIds.get(name);
  };

  const resolve = (primary: string, alternate: string, fallback: number): number =>
    added.get(primary) ??
    added.get(alternate) ??
    fromVocab(primary) ??
    fromVocab(alternate) ??
    fallback;

  return {
    cls: resolve('<s>', '[CLS]', 0),
    sep: resolve('</s>', '[SEP]', 2),
    pad: resolve('<pad>', '[PAD]', 1),
  };
}
|