File size: 8,632 Bytes

bd7a5c9

/**
 * SentencePiece tokenizer wrapper.
 *
 * The trained CorefPointer uses ``paraphrase-multilingual-MiniLM-L12``,
 * which inherits XLM-R's 250k SentencePiece vocab. We need offsets
 * (char-start/char-end per wordpiece) to project mention spans back
 * onto the source text — that's what makes the BIO output usable.
 *
 * We use HF's ``tokenizers`` JSON format directly via a
 * small JSON-driven implementation here rather than depend on
 * ``@huggingface/tokenizers``, which is heavyweight and ships
 * different artefacts for browser vs Node. The HF JSON spec is
 * stable and the SentencePiece Unigram path that XLM-R uses is small
 * enough to implement well in ~150 lines.
 *
 * For the alpha we use ``tokenizers``'s ``encode`` via dynamic import
 * if it's available, else fall back to a minimal SP tokenizer that
 * handles the XLM-R subset. Both paths are intended to return
 * identical (id, start, end) triples for our test sentences.
 *
 * NOTE: this file intentionally has no top-level DOM/Node-specific
 * code so the tree-shaker can drop unused branches. The only
 * environment-specific operations are the ``fetch`` call and the
 * dynamic imports inside ``loadTokenizer``.
 */

import type { Token } from './types.js';

/**
 * Tokenized output ready for the model.
 *
 * The tokenizers in this file build all three fields in lock-step, so
 * entry ``i`` of each field describes the same token.
 */
export interface Encoding {
  /** Token ids, widened to 64-bit integers. */
  inputIds: BigInt64Array;
  /** Attention mask, one entry per token. The minimal fallback
   * tokenizer sets every entry to 1; the HF backend passes through
   * whatever the library reports. */
  attentionMask: BigInt64Array;
  /** Per-token record. The tokenizers in this file populate ``id``,
   * ``text``, ``start`` and ``end`` on each entry. */
  tokens: Token[];
}

/** Loaded tokenizer state. ``tokenize`` is the only method
 * downstream code uses. */
export interface Tokenizer {
  /** Encode ``text`` into ids, attention mask and per-token offsets.
   *
   * ``opts.maxLength`` is the requested cap on the number of tokens
   * returned; both implementations in this file default it to 256
   * when omitted. */
  tokenize(text: string, opts?: { maxLength?: number }): Encoding;
  /** Special-token ids. Used by the model to know what to skip when
   * building mention boundaries (CLS/SEP/PAD shouldn't be included
   * in spans). */
  specials: { cls: number; sep: number; pad: number };
}

/** Load a tokenizer from a ``tokenizer.json`` URL, file path, or bytes.
 *
 * How ``src`` is interpreted:
 *
 * - ``ArrayBuffer`` / ``Uint8Array``: decoded as UTF-8 JSON text.
 * - ``string`` starting with ``http://`` or ``https://``: fetched via
 *   ``fetch`` in every environment.
 * - any other ``string``: read from disk with ``node:fs/promises``
 *   when running on Node without a ``window`` global; otherwise
 *   (browser page, Web Worker, other non-Node runtime) it is treated
 *   as a URL and fetched.
 *
 * After parsing, the ``@huggingface/tokenizers`` backend is tried
 * first; if importing or constructing it throws, the minimal built-in
 * tokenizer is used instead.
 *
 * @param src URL, file path, or raw bytes of a ``tokenizer.json``.
 * @returns A ready-to-use {@link Tokenizer}.
 * @throws Error if the fetch response is not OK, if the payload is not
 *   valid JSON, or if the minimal fallback rejects the model type.
 */
export async function loadTokenizer(
  src: string | ArrayBuffer | Uint8Array,
): Promise<Tokenizer> {
  let json: unknown;
  if (typeof src === 'string') {
    const hasWindow = typeof window !== 'undefined';
    // Read ``process`` through globalThis so this compiles and runs
    // where no Node globals (or Node typings) exist.
    const nodeVersion = (
      globalThis as { process?: { versions?: { node?: string } } }
    ).process?.versions?.node;
    const isNode = typeof nodeVersion === 'string';
    // Require a real scheme: a bare ``startsWith('http')`` would also
    // match local paths such as ``http_cache/tokenizer.json``.
    const isHttpUrl = /^https?:\/\//i.test(src);
    if (hasWindow || !isNode || isHttpUrl) {
      const r = await fetch(src);
      if (!r.ok) throw new Error(`tokenizer fetch failed: ${r.status}`);
      json = await r.json();
    } else {
      // Node file path.
      const fs = await import('node:fs/promises');
      const buf = await fs.readFile(src, 'utf-8');
      json = JSON.parse(buf);
    }
  } else {
    const decoder = new TextDecoder();
    const buf =
      src instanceof Uint8Array ? src : new Uint8Array(src);
    json = JSON.parse(decoder.decode(buf));
  }

  // Try the @huggingface/tokenizers path first (fast, native WASM).
  // Fall back to our minimal implementation if it isn't installed.
  // The dynamic spec is computed so bundlers don't try to resolve it
  // at build time when the user hasn't installed it.
  try {
    const spec = '@huggingface/tokenizers';
    const hf = await import(/* @vite-ignore */ spec);
    return makeHfTokenizer(hf, json);
  } catch {
    // Deliberate best-effort: any failure to import or construct the
    // HF backend routes to the built-in tokenizer.
    return makeMinimalTokenizer(json);
  }
}

// ── HF @huggingface/tokenizers backend ───────────────────────────────

/**
 * Build a {@link Tokenizer} backed by the dynamically imported HF module.
 *
 * NOTE(review): the casts below assume the module exposes
 * ``Tokenizer.fromString`` and a synchronous ``encode`` whose result has
 * ``getIds``/``getAttentionMask``/``getOffsets``/``getTokens`` — confirm
 * against the installed package version.
 *
 * Truncation keeps the closing SEP: when the encoding is longer than
 * ``maxLength`` and ends with the SEP id, the first ``maxLength - 1``
 * tokens are kept and the final SEP is re-appended. This matches the
 * minimal fallback, which always terminates with SEP.
 *
 * @param hf The imported module object.
 * @param json Parsed ``tokenizer.json`` content.
 * @returns A tokenizer whose ``tokenize`` defaults ``maxLength`` to 256.
 */
function makeHfTokenizer(hf: unknown, json: unknown): Tokenizer {
  const Mod = hf as {
    Tokenizer: { fromString(s: string): { encode(t: string): unknown } };
  };
  const tk = Mod.Tokenizer.fromString(JSON.stringify(json));

  const specials = pickSpecials(json);

  return {
    specials,
    tokenize(text, opts) {
      const max = opts?.maxLength ?? 256;
      const enc = tk.encode(text) as {
        getIds(): number[];
        getAttentionMask(): number[];
        getOffsets(): [number, number][];
        getTokens(): string[];
      };
      const allIds = enc.getIds();
      // A plain slice(0, max) would cut off the trailing SEP on long
      // inputs; reserve the last slot for it instead.
      const keepSep =
        allIds.length > max &&
        max >= 2 &&
        allIds[allIds.length - 1] === specials.sep;
      const clip = <T>(xs: T[]): T[] =>
        keepSep ? [...xs.slice(0, max - 1), ...xs.slice(-1)] : xs.slice(0, max);

      const ids = clip(allIds);
      const attn = clip(enc.getAttentionMask());
      const offsets = clip(enc.getOffsets());
      const toks = clip(enc.getTokens());
      const tokens: Token[] = ids.map((id, i) => {
        const [start, end] = offsets[i] ?? [0, 0];
        return { id, text: toks[i] ?? '', start, end };
      });
      return {
        inputIds: BigInt64Array.from(ids.map((x) => BigInt(x))),
        attentionMask: BigInt64Array.from(attn.map((x) => BigInt(x))),
        tokens,
      };
    },
  };
}

// ── Minimal SentencePiece fallback ──────────────────────────────────

/**
 * NFKC-normalize ``text`` and record, for every UTF-16 unit of the
 * normalized string, the ``[start, end)`` range of ``text`` it came from.
 *
 * The source is split into clusters (one non-mark code point plus any
 * following combining marks) and each cluster is normalized on its own.
 * A cluster whose length is unchanged maps unit-for-unit; otherwise every
 * produced unit maps to the whole cluster. If the per-cluster result does
 * not reproduce the whole-string normalization (e.g. conjoining Hangul
 * jamo), the mapping falls back to counting normalized units directly,
 * clamped to the source length.
 */
function alignNfkc(text: string): {
  norm: string;
  starts: number[];
  ends: number[];
} {
  const norm = text.normalize('NFKC');
  const starts: number[] = [];
  const ends: number[] = [];

  if (norm !== text) {
    let rebuilt = '';
    let pos = 0;
    for (const m of text.matchAll(/\P{M}\p{M}*|\p{M}+/gu)) {
      const cluster = m[0];
      const clusterNorm = cluster.normalize('NFKC');
      const sameLength = clusterNorm.length === cluster.length;
      for (let k = 0; k < clusterNorm.length; k++) {
        starts.push(sameLength ? pos + k : pos);
        ends.push(sameLength ? pos + k + 1 : pos + cluster.length);
      }
      rebuilt += clusterNorm;
      pos += cluster.length;
    }
    if (rebuilt === norm) return { norm, starts, ends };
    // Alignment could not be verified; discard it and use the fallback.
    starts.length = 0;
    ends.length = 0;
  }

  for (let i = 0; i < norm.length; i++) {
    starts.push(Math.min(i, text.length));
    ends.push(Math.min(i + 1, text.length));
  }
  return { norm, starts, ends };
}

/**
 * Minimal XLM-R-compatible SentencePiece tokenizer.
 *
 * Implements just enough to handle the multilingual MiniLM
 * vocabulary: NFKC normalization → space-prefixing (``' '`` → ``▁``,
 * plus a leading ``▁`` unless the text already starts with a space)
 * → greedy longest-match lookup over the model's pieces. A real
 * Unigram model picks the best-scoring segmentation; greedy matching
 * is an approximation, so install ``@huggingface/tokenizers`` for
 * accuracy-critical use.
 *
 * Offsets are UTF-16 indices into the *original* (un-normalized)
 * string. A ``▁`` that came from a real space is counted as part of
 * the piece that contains it; the synthetic leading ``▁`` has zero
 * width. Characters with no vocabulary match are emitted one code
 * point at a time with the unknown id.
 *
 * @param json Parsed ``tokenizer.json`` content with a Unigram model.
 * @returns A tokenizer whose ``tokenize`` defaults ``maxLength`` to 256
 *   and always wraps the pieces in CLS … SEP.
 * @throws Error if ``model.type`` is not ``'Unigram'``.
 */
function makeMinimalTokenizer(json: unknown): Tokenizer {
  const obj = json as {
    model: {
      type: string;
      vocab: [string, number][];
      unk_id?: number;
    };
    added_tokens?: { id: number; content: string }[];
  };
  if (obj.model.type !== 'Unigram') {
    throw new Error(
      `minimal tokenizer only supports Unigram; got ${obj.model.type}. ` +
        'Install @huggingface/tokenizers for full support.',
    );
  }

  // A piece's id is its position in the vocab array. Using the array
  // index (not the map size) keeps later ids correct even if a piece
  // is listed twice.
  const vocab = new Map<string, number>();
  let maxPieceLen = 1;
  obj.model.vocab.forEach(([piece], index) => {
    vocab.set(piece, index);
    if (piece.length > maxPieceLen) maxPieceLen = piece.length;
  });
  const unk = obj.model.unk_id ?? vocab.get('<unk>') ?? 0;
  const specials = pickSpecials(json);

  const SPACE = '▁'; // ▁

  function encode(text: string, max: number): Encoding {
    const { norm, starts, ends } = alignNfkc(text);
    // Word-initial pieces carry a ``▁`` prefix, so the first word needs
    // one too. Skip it for empty input and when the text already opens
    // with a space (that space becomes the prefix).
    const prefixLen = norm.length > 0 && !norm.startsWith(' ') ? 1 : 0;
    const piece = (prefixLen === 1 ? SPACE : '') + norm.replace(/ /g, SPACE);

    const ids: number[] = [specials.cls];
    const tokens: Token[] = [
      { id: specials.cls, text: '<s>', start: 0, end: 0 },
    ];
    let p = 0;
    let lastEnd = 0;
    // Leave one slot for the closing SEP.
    while (p < piece.length && ids.length < max - 1) {
      let matchLen = 0;
      let matchId = unk;
      const longest = Math.min(piece.length - p, maxPieceLen);
      for (let len = longest; len >= 1; len--) {
        const id = vocab.get(piece.substring(p, p + len));
        if (id !== undefined) {
          matchLen = len;
          matchId = id;
          break;
        }
      }
      if (matchLen === 0) {
        // Unknown character: consume a whole code point so surrogate
        // pairs are never split across two tokens.
        const cp = piece.codePointAt(p);
        matchLen = cp !== undefined && cp > 0xffff ? 2 : 1;
      }
      const tokenText = piece.substring(p, p + matchLen);

      // piece[k] corresponds to norm[k - prefixLen]; a negative index is
      // the synthetic prefix, which has no source character.
      const first = p - prefixLen;
      const last = p + matchLen - 1 - prefixLen;
      const start = first >= 0 ? (starts[first] ?? lastEnd) : 0;
      const end = last >= 0 ? (ends[last] ?? lastEnd) : 0;

      tokens.push({ id: matchId, text: tokenText, start, end });
      ids.push(matchId);
      p += matchLen;
      lastEnd = end;
    }
    ids.push(specials.sep);
    tokens.push({
      id: specials.sep,
      text: '</s>',
      start: lastEnd,
      end: lastEnd,
    });
    return {
      inputIds: BigInt64Array.from(ids.map((x) => BigInt(x))),
      attentionMask: new BigInt64Array(ids.length).fill(1n),
      tokens,
    };
  }

  return {
    specials,
    tokenize(text, opts) {
      return encode(text, opts?.maxLength ?? 256);
    },
  };
}

/**
 * Resolve the CLS, SEP and PAD token ids from a parsed ``tokenizer.json``.
 *
 * XLM-R uses ``<s>``/``</s>``/``<pad>``; some vocabs use
 * ``[CLS]``/``[SEP]``/``[PAD]``. For each special, ``added_tokens`` is
 * consulted first (authoritative) for both spellings, then the model
 * vocab. The vocab may be an array of ``[piece, score]`` pairs (Unigram,
 * id = array index) or a ``{ piece: id }`` map (WordPiece/BPE). A missing
 * ``model`` or ``vocab`` is tolerated.
 *
 * @param json Parsed ``tokenizer.json`` content.
 * @returns The resolved ids; specials that cannot be found default to
 *   ``cls = 0``, ``sep = 2``, ``pad = 1``.
 */
function pickSpecials(json: unknown): {
  cls: number;
  sep: number;
  pad: number;
} {
  const obj = json as {
    added_tokens?: { id: number; content: string }[];
    model?: { vocab?: [string, number][] | Record<string, number> };
  };

  const added = new Map<string, number>();
  for (const t of obj.added_tokens ?? []) added.set(t.content, t.id);

  // Indexing the vocab can mean walking ~250k entries, so only do it
  // when added_tokens did not already answer the question.
  const buildVocabIndex = (): Map<string, number> => {
    const index = new Map<string, number>();
    const vocab = obj.model?.vocab;
    if (Array.isArray(vocab)) {
      vocab.forEach((entry, i) => {
        const piece = Array.isArray(entry) ? entry[0] : undefined;
        if (typeof piece === 'string') index.set(piece, i);
      });
    } else if (vocab !== null && typeof vocab === 'object') {
      for (const [piece, id] of Object.entries(vocab)) {
        if (typeof id === 'number') index.set(piece, id);
      }
    }
    return index;
  };
  let vocabIndex: Map<string, number> | undefined;

  const resolve = (names: string[], fallback: number): number => {
    for (const name of names) {
      const id = added.get(name);
      if (id !== undefined) return id;
    }
    if (vocabIndex === undefined) vocabIndex = buildVocabIndex();
    for (const name of names) {
      const id = vocabIndex.get(name);
      if (id !== undefined) return id;
    }
    return fallback;
  };

  const cls = resolve(['<s>', '[CLS]'], 0);
  const sep = resolve(['</s>', '[SEP]'], 2);
  const pad = resolve(['<pad>', '[PAD]'], 1);
  return { cls, sep, pad };
}