File size: 8,632 Bytes
bd7a5c9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
/**
 * SentencePiece tokenizer wrapper.
 *
 * The trained CorefPointer uses ``paraphrase-multilingual-MiniLM-L12``,
 * which inherits XLM-R's 250k SentencePiece vocab. We need offsets
 * (char-start/char-end per wordpiece) to project mention spans back
 * onto the source text — that's what makes the BIO output usable.
 *
 * We use HF's ``tokenizers`` JSON format directly via a
 * small JSON-driven implementation here rather than depend on
 * ``@huggingface/tokenizers``, which is heavyweight and ships
 * different artefacts for browser vs Node. The HF JSON spec is
 * stable and the SentencePiece Unigram path that XLM-R uses is small
 * enough to implement well in ~150 lines.
 *
 * For the alpha we use ``tokenizers``'s ``encode`` via dynamic import
 * if it's available, else fall back to a minimal SP tokenizer that
 * handles the XLM-R subset. Both paths return identical (id, char,
 * end) triples for our test sentences.
 *
 * NOTE: this file intentionally has no DOM/Node-specific code so the
 * tree-shaker can drop unused branches. The only side effects are
 * the dynamic imports inside ``loadFrom*``.
 */

import type { Token } from './types.js';

/** Tokenized output ready for the model.
 *
 * The tokenizers in this file build all three fields from the same
 * token sequence. */
export interface Encoding {
  /** Token ids, widened to 64-bit integers. */
  inputIds: BigInt64Array;
  /** Attention-mask value for each token, as 64-bit integers. */
  attentionMask: BigInt64Array;
  /** Per-token records (``Token`` is declared in ``./types.js``). The
   * tokenizers in this file fill in ``id``, ``text``, ``start`` and
   * ``end`` for each entry. */
  tokens: Token[];
}

/** Loaded tokenizer state. ``tokenize`` is the only method
 * downstream code uses. */
export interface Tokenizer {
  /** Encode ``text`` into ids, an attention mask and per-token
   * records. ``opts.maxLength`` caps how many tokens are returned;
   * the implementations in this file use 256 when it is omitted. */
  tokenize(text: string, opts?: { maxLength?: number }): Encoding;
  /** Special-token ids. Used by the model to know what to skip when
   * building mention boundaries (CLS/SEP/PAD shouldn't be included
   * in spans). */
  specials: { cls: number; sep: number; pad: number };
}

/**
 * Load a tokenizer from a ``tokenizer.json`` source.
 *
 * ``src`` may be:
 *  - a string — fetched with ``fetch`` when running in a browser or
 *    when it starts with ``http``; otherwise read from disk through
 *    ``node:fs/promises``;
 *  - an ``ArrayBuffer`` / ``Uint8Array`` holding the UTF-8 JSON bytes
 *    that the caller read themselves.
 *
 * Once the JSON is parsed, the ``@huggingface/tokenizers`` backend is
 * tried first; any failure on that path (package missing, or the
 * backend throwing while being set up) falls through to the minimal
 * built-in tokenizer.
 *
 * @throws Error when the HTTP fetch returns a non-OK status.
 */
export async function loadTokenizer(
  src: string | ArrayBuffer | Uint8Array,
): Promise<Tokenizer> {
  const parsed: unknown = await (async (): Promise<unknown> => {
    // Raw bytes: decode as UTF-8 and parse.
    if (typeof src !== 'string') {
      const bytes = src instanceof Uint8Array ? src : new Uint8Array(src);
      return JSON.parse(new TextDecoder().decode(bytes));
    }

    // Plain path outside the browser: read it from the filesystem.
    const inBrowser = typeof window !== 'undefined';
    if (!inBrowser && !src.startsWith('http')) {
      const fsPromises = await import('node:fs/promises');
      const contents = await fsPromises.readFile(src, 'utf-8');
      return JSON.parse(contents);
    }

    // Everything else is treated as a URL.
    const response = await fetch(src);
    if (!response.ok) {
      throw new Error(`tokenizer fetch failed: ${response.status}`);
    }
    return response.json();
  })();

  // The module name is held in a variable so bundlers do not try to
  // resolve the optional dependency at build time.
  try {
    const moduleName = '@huggingface/tokenizers';
    const hfModule = await import(/* @vite-ignore */ moduleName);
    return makeHfTokenizer(hfModule, parsed);
  } catch {
    return makeMinimalTokenizer(parsed);
  }
}

// ── HF @huggingface/tokenizers backend ───────────────────────────────

/**
 * Build a {@link Tokenizer} on top of an already-imported HF
 * tokenizers module.
 *
 * The module is expected to expose ``Tokenizer.fromString``; the
 * tokenizer JSON is re-serialised and handed to it. Special-token ids
 * are resolved from the same JSON via ``pickSpecials``.
 *
 * Truncation: when the encoding is longer than ``maxLength`` and its
 * last id is the SEP id, the first ``maxLength - 1`` entries are kept
 * and the final (SEP) entry is re-appended, so a truncated sequence
 * still ends with SEP — matching what the minimal fallback tokenizer
 * does. Otherwise the encoding is simply clipped to ``maxLength``.
 *
 * @param hf   The dynamically imported module object.
 * @param json Parsed ``tokenizer.json`` contents.
 * @returns A tokenizer whose ``tokenize`` defaults ``maxLength`` to 256.
 */
function makeHfTokenizer(hf: unknown, json: unknown): Tokenizer {
  const Mod = hf as {
    Tokenizer: { fromString(s: string): { encode(t: string): unknown } };
  };
  const tk = Mod.Tokenizer.fromString(JSON.stringify(json));

  const specials = pickSpecials(json);

  return {
    specials,
    tokenize(text, opts) {
      const max = opts?.maxLength ?? 256;
      const enc = tk.encode(text) as {
        getIds(): number[];
        getAttentionMask(): number[];
        getOffsets(): [number, number][];
        getTokens(): string[];
      };

      const allIds = enc.getIds();
      // Only re-attach the tail when we are really cutting something
      // off and the tail is the SEP token; needs room for >= 2 tokens.
      const keepSep =
        max >= 2 &&
        allIds.length > max &&
        allIds[allIds.length - 1] === specials.sep;
      const clip = <T>(xs: T[]): T[] =>
        keepSep
          ? [...xs.slice(0, max - 1), ...xs.slice(-1)]
          : xs.slice(0, max);

      const ids = clip(allIds);
      const attn = clip(enc.getAttentionMask());
      const offsets = clip(enc.getOffsets());
      const toks = clip(enc.getTokens());

      const tokens: Token[] = ids.map((id, i) => ({
        id,
        text: toks[i],
        start: offsets[i][0],
        end: offsets[i][1],
      }));
      return {
        inputIds: BigInt64Array.from(ids.map((x) => BigInt(x))),
        attentionMask: BigInt64Array.from(attn.map((x) => BigInt(x))),
        tokens,
      };
    },
  };
}

// ── Minimal SentencePiece fallback ──────────────────────────────────

/**
 * Minimal XLM-R-compatible SentencePiece tokenizer.
 *
 * Implements just enough to approximate the multilingual MiniLM
 * tokenizer: NFKC normalization → metaspace prefixing (' ' → '▁',
 * plus a leading '▁' when the text does not already start with one)
 * → greedy longest-prefix matching against the model's pieces.
 *
 * Offsets are UTF-16 indices into the NFKC-normalized text. They
 * coincide with indices into the caller's string whenever
 * normalization does not change its length; the synthetic leading
 * '▁' never counts towards an offset.
 *
 * This isn't a full HF Tokenizers reimplementation — real Unigram
 * decoding uses a scored search, greedy matching is an approximation.
 * If ``@huggingface/tokenizers`` is installed we always prefer it.
 *
 * @param json Parsed ``tokenizer.json`` contents (Unigram model).
 * @returns A tokenizer whose ``tokenize`` defaults ``maxLength`` to 256.
 * @throws Error when ``model.type`` is not ``'Unigram'``.
 */
function makeMinimalTokenizer(json: unknown): Tokenizer {
  const obj = json as {
    model: {
      type: string;
      vocab: [string, number][];
      unk_id?: number;
    };
    added_tokens?: { id: number; content: string }[];
  };
  if (obj.model.type !== 'Unigram') {
    throw new Error(
      `minimal tokenizer only supports Unigram; got ${obj.model.type}. ` +
        'Install @huggingface/tokenizers for full support.',
    );
  }

  // A piece's id is its position in the vocab array. Use the array
  // index directly so a repeated piece cannot shift later ids.
  const vocab = new Map<string, number>();
  obj.model.vocab.forEach(([piece], index) => {
    if (!vocab.has(piece)) vocab.set(piece, index);
  });
  const unk = obj.model.unk_id ?? vocab.get('<unk>') ?? 0;
  const specials = pickSpecials(json);

  const SPACE = '\u2581'; // ▁
  /** Longest candidate piece (in UTF-16 units) tried at each position. */
  const MAX_PIECE_LEN = 24;

  function encode(text: string, max: number): Encoding {
    const norm = text.normalize('NFKC');
    const body = norm.replace(/ /g, SPACE);
    // Prefix the first word with ▁ so it matches word-initial pieces,
    // unless the text is empty or already starts with one.
    const prefixLen = body.length > 0 && !body.startsWith(SPACE) ? 1 : 0;
    const piece = prefixLen === 1 ? SPACE + body : body;

    const ids: number[] = [specials.cls];
    const tokens: Token[] = [
      { id: specials.cls, text: '<s>', start: 0, end: 0 },
    ];

    let p = 0;
    // Leave one slot for the closing SEP token.
    while (p < piece.length && ids.length < max - 1) {
      let bestLen = 0;
      let bestId = unk;
      const longest = Math.min(piece.length - p, MAX_PIECE_LEN);
      for (let len = longest; len >= 1; len--) {
        const id = vocab.get(piece.substring(p, p + len));
        if (id !== undefined) {
          bestLen = len;
          bestId = id;
          break;
        }
      }
      if (bestLen === 0) {
        // Unknown character: consume one whole code point so a
        // surrogate pair is never split across two tokens.
        const cp = piece.codePointAt(p);
        bestLen = cp !== undefined && cp > 0xffff ? 2 : 1;
      }
      const bestText = piece.substring(p, p + bestLen);

      // ``piece`` index i maps to ``norm`` index i - prefixLen; the
      // synthetic prefix itself has no source character.
      const start = Math.max(p - prefixLen, 0);
      const end = p + bestLen - prefixLen;
      tokens.push({ id: bestId, text: bestText, start, end });
      ids.push(bestId);
      p += bestLen;
    }

    // SEP sits at the end of whatever text was consumed.
    const tail = Math.max(p - prefixLen, 0);
    ids.push(specials.sep);
    tokens.push({ id: specials.sep, text: '</s>', start: tail, end: tail });

    return {
      inputIds: BigInt64Array.from(ids.map((x) => BigInt(x))),
      attentionMask: new BigInt64Array(ids.length).fill(1n),
      tokens,
    };
  }

  return {
    specials,
    tokenize(text, opts) {
      return encode(text, opts?.maxLength ?? 256);
    },
  };
}

/**
 * Resolve the CLS / SEP / PAD token ids from a parsed
 * ``tokenizer.json``.
 *
 * XLM-R uses ``<s>`` / ``</s>`` / ``<pad>``; some vocabs use
 * ``[CLS]`` / ``[SEP]`` / ``[PAD]``. Each id is looked up
 * independently:
 *   1. in ``added_tokens`` (authoritative),
 *   2. then in ``model.vocab`` — either an array of
 *      ``[piece, score]`` pairs (id = array index) or a
 *      ``piece → id`` object,
 *   3. then the defaults cls = 0, sep = 2, pad = 1.
 *
 * The vocab index is only built if step 1 misses, so the common case
 * never walks the full vocabulary.
 *
 * @param json Parsed tokenizer JSON; missing sections are tolerated.
 * @returns The three special-token ids.
 */
function pickSpecials(json: unknown): {
  cls: number;
  sep: number;
  pad: number;
} {
  const obj = (json ?? {}) as {
    added_tokens?: { id: number; content: string }[];
    model?: { vocab?: [string, number][] | Record<string, number> };
  };

  const added = new Map<string, number>();
  for (const t of obj.added_tokens ?? []) added.set(t.content, t.id);

  let vocabIds: Map<string, number> | undefined;
  const vocabLookup = (piece: string): number | undefined => {
    if (vocabIds === undefined) {
      const built = new Map<string, number>();
      const vocab = obj.model?.vocab;
      if (Array.isArray(vocab)) {
        vocab.forEach(([p], index) => built.set(p, index));
      } else if (vocab !== null && typeof vocab === 'object') {
        for (const [p, id] of Object.entries(vocab)) {
          if (typeof id === 'number') built.set(p, id);
        }
      }
      vocabIds = built;
    }
    return vocabIds.get(piece);
  };

  // Try every alias in added_tokens before consulting the vocab.
  const find = (...aliases: string[]): number | undefined => {
    for (const name of aliases) {
      const id = added.get(name);
      if (id !== undefined) return id;
    }
    for (const name of aliases) {
      const id = vocabLookup(name);
      if (id !== undefined) return id;
    }
    return undefined;
  };

  return {
    cls: find('<s>', '[CLS]') ?? 0,
    sep: find('</s>', '[SEP]') ?? 2,
    pad: find('<pad>', '[PAD]') ?? 1,
  };
}