export type ClipTokenizerFiles = {
  vocabJson: string;
  mergesTxt: string;
};

type Vocab = Record<string, number>;
// GPT-2-style reversible byte <-> unicode mapping: every byte 0-255 gets a
// printable single-character stand-in, so arbitrary UTF-8 bytes can be run
// through a string-based BPE.
function bytesToUnicode() {
  // Bytes that are already printable keep their own code points.
  const bs: number[] = [];
  for (let i = 33; i <= 126; i++) bs.push(i);
  for (let i = 161; i <= 172; i++) bs.push(i);
  for (let i = 174; i <= 255; i++) bs.push(i);

  // Every remaining byte is assigned the next free code point above 255.
  const cs = bs.slice();
  let n = 0;
  for (let b = 0; b < 256; b++) {
    if (!bs.includes(b)) {
      bs.push(b);
      cs.push(256 + n);
      n += 1;
    }
  }

  const byteToUnicode = new Map<number, string>();
  const unicodeToByte = new Map<string, number>();
  for (let i = 0; i < bs.length; i++) {
    const ch = String.fromCharCode(cs[i]);
    byteToUnicode.set(bs[i], ch);
    unicodeToByte.set(ch, bs[i]);
  }
  return { byteToUnicode, unicodeToByte };
}
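// For example, printable bytes map to themselves, while control bytes and
// the space byte get stand-ins at U+0100 and above (space becomes 'Ġ',
// U+0120):
//
//   const { byteToUnicode } = bytesToUnicode();
//   byteToUnicode.get(65); // 'A'
//   byteToUnicode.get(32); // 'Ġ'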

function getPairs(word: string[]) {
  // Set of adjacent symbol pairs; '\u0001' is used as a separator because
  // it can never appear inside a token.
  const pairs = new Set<string>();
  let prev = word[0];
  for (let i = 1; i < word.length; i++) {
    const curr = word[i];
    pairs.add(`${prev}\u0001${curr}`);
    prev = curr;
  }
  return pairs;
}

function bpe(token: string, bpeRanks: Map<string, number>) {
  // CLIP marks the end of each word with '</w>': vocab entries and merge
  // rules distinguish word-final symbols from word-internal ones.
  let word = token.split('');
  if (word.length === 0) return token;
  word[word.length - 1] += '</w>';
  if (word.length === 1) return word[0];

  while (true) {
    const pairs = getPairs(word);
    if (pairs.size === 0) break;

    // Find the pair with the lowest merge rank (highest priority).
    let minPair: string | null = null;
    let minRank = Infinity;
    for (const p of pairs) {
      const r = bpeRanks.get(p);
      if (r != null && r < minRank) {
        minRank = r;
        minPair = p;
      }
    }
    if (minPair == null) break;
    const [first, second] = minPair.split('\u0001');

    // Merge every occurrence of (first, second) into a single symbol.
    const newWord: string[] = [];
    let i = 0;
    while (i < word.length) {
      const j = word.indexOf(first, i);
      if (j === -1) {
        newWord.push(...word.slice(i));
        break;
      }
      newWord.push(...word.slice(i, j));
      i = j;
      if (word[i] === first && i < word.length - 1 && word[i + 1] === second) {
        newWord.push(first + second);
        i += 2;
      } else {
        newWord.push(word[i]);
        i += 1;
      }
    }
    word = newWord;
    if (word.length === 1) break;
  }
  return word.join(' ');
}
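// Worked example (the exact output depends on the merges file): for the
// token "low", with a merge "l o" ranked ahead of "lo w</w>", the word
// starts as ['l', 'o', 'w</w>'], becomes ['lo', 'w</w>'], then ['low</w>'],
// and bpe() returns the single symbol 'low</w>'.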

function utf8ToBytes(text: string): Uint8Array {
  // Prefer the platform encoder when it exists.
  const TE = (globalThis as any).TextEncoder;
  if (TE) return new TE().encode(text);

  // Minimal fallback. Iterating by code point (for...of) keeps surrogate
  // pairs together so astral characters encode as proper 4-byte sequences.
  const out: number[] = [];
  for (const ch of text) {
    const cp = ch.codePointAt(0)!;
    if (cp < 0x80) out.push(cp);
    else if (cp < 0x800) {
      out.push(0xc0 | (cp >> 6), 0x80 | (cp & 0x3f));
    } else if (cp < 0x10000) {
      out.push(0xe0 | (cp >> 12), 0x80 | ((cp >> 6) & 0x3f), 0x80 | (cp & 0x3f));
    } else {
      out.push(
        0xf0 | (cp >> 18),
        0x80 | ((cp >> 12) & 0x3f),
        0x80 | ((cp >> 6) & 0x3f),
        0x80 | (cp & 0x3f),
      );
    }
  }
  return Uint8Array.from(out);
}
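// e.g. utf8ToBytes('é')  -> Uint8Array [0xc3, 0xa9]
//      utf8ToBytes('😀') -> Uint8Array [0xf0, 0x9f, 0x98, 0x80]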

export function createClipTokenizer(files: ClipTokenizerFiles) {
  const vocab: Vocab = JSON.parse(files.vocabJson);
  const merges = files.mergesTxt
    .split('\n')
    .map((l) => l.trim())
    .filter((l) => l && !l.startsWith('#'));

  // Hugging Face-style merges files begin with a "#version: ..." header.
  // The '#' filter above normally drops it, but tolerate an unprefixed
  // version line too. Each remaining line is a symbol pair; its position
  // is its merge rank (lower rank = higher priority).
  const mergePairs = merges[0]?.includes('version') ? merges.slice(1) : merges;
  const bpeRanks = new Map<string, number>();
  for (let i = 0; i < mergePairs.length; i++) {
    const parts = mergePairs[i].split(/\s+/);
    if (parts.length !== 2) continue;
    bpeRanks.set(`${parts[0]}\u0001${parts[1]}`, i);
  }
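  // e.g. a first merges line "i n" would produce the entry 'i\u0001n' -> 0,
  // the highest-priority merge.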

  const { byteToUnicode } = bytesToUnicode();

  const bosToken = '<|startoftext|>';
  const eosToken = '<|endoftext|>';
  const bosId = vocab[bosToken] ?? 49406;
  const eosId = vocab[eosToken] ?? 49407;

  // encode() results are memoized per (text, maxLen) pair.
  const cache = new Map<string, number[]>();

  // CLIP's splitting pattern (minus the literal special-token alternatives,
  // since encode() adds BOS/EOS itself): contractions, letter runs, single
  // digits, and punctuation runs. Whitespace only separates matches; word
  // boundaries are carried by the '</w>' marker added in bpe().
  const pat = /'s|'t|'re|'ve|'m|'ll|'d|\p{L}+|\p{N}|[^\s\p{L}\p{N}]+/giu;

  function encode(text: string, maxLen = 77) {
    const key = `${text}\u0001${maxLen}`;
    const cached = cache.get(key);
    if (cached) return cached.slice();

    // CLIP lowercases and collapses whitespace before tokenizing; the
    // vocab only contains lowercase entries.
    const cleaned = text.toLowerCase().replace(/\s+/g, ' ').trim();
    const matches = cleaned.match(pat) ?? [];
    const bpeTokens: number[] = [];

    for (const m of matches) {
      // Map the token's UTF-8 bytes to their printable stand-ins, then BPE.
      const bytes = utf8ToBytes(m);
      let chars = '';
      for (const b of bytes) chars += byteToUnicode.get(b) ?? '';

      const bpeOut = bpe(chars, bpeRanks);
      for (const tok of bpeOut.split(' ')) {
        const id = vocab[tok];
        // A complete CLIP vocab covers every byte sequence; fall back to
        // EOS only if a symbol is somehow missing.
        bpeTokens.push(id == null ? eosId : id);
      }
    }

    const ids: number[] = [bosId, ...bpeTokens, eosId];
    if (ids.length > maxLen) {
      // Truncate, but keep the sequence EOS-terminated as CLIP does.
      ids.length = maxLen;
      ids[maxLen - 1] = eosId;
    }
    while (ids.length < maxLen) ids.push(eosId);

    cache.set(key, ids.slice());
    return ids;
  }

  return { encode, bosId, eosId };
}
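// Usage sketch (loading the files is up to the caller; the paths here are
// illustrative only):
//
//   import { readFileSync } from 'node:fs';
//
//   const tokenizer = createClipTokenizer({
//     vocabJson: readFileSync('clip/vocab.json', 'utf8'),
//     mergesTxt: readFileSync('clip/merges.txt', 'utf8'),
//   });
//   const ids = tokenizer.encode('a photo of a cat');
//   // ids.length === 77, starting with bosId and padded with eosId.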