Spaces:
Paused
Paused
// Helpers for enforcing embedding model input size limits.
//
// We use UTF-8 byte length as a conservative upper bound on tokenizer output.
// Tokenizers operate over bytes and every token must contain at least one byte,
// so token_count <= utf8_byte_length.
/**
 * Returns the number of bytes `text` occupies when encoded as UTF-8.
 *
 * @param text - String to measure.
 * @returns The UTF-8 byte length of `text`, or 0 when `text` is empty (or otherwise falsy).
 */
export function estimateUtf8Bytes(text: string): number {
  // Short-circuit falsy input so it is never handed to Buffer.byteLength.
  if (!text) {
    return 0;
  }
  return Buffer.byteLength(text, "utf8");
}
/**
 * Splits `text` into consecutive parts whose UTF-8 encoding is at most
 * `maxUtf8Bytes` bytes each.
 *
 * Guarantees:
 * - Concatenating the returned parts reproduces `text` exactly.
 * - A surrogate pair is never split across two parts.
 * - A single code point that is larger than the limit on its own cannot be
 *   shortened, so it is emitted as its own (oversized) part rather than dropped.
 *
 * @param text - Text to split.
 * @param maxUtf8Bytes - Per-part byte budget. Values <= 0 disable splitting.
 * @returns `[text]` when splitting is disabled or `text` already fits;
 *   otherwise the parts, in order.
 */
export function splitTextToUtf8ByteLimit(text: string, maxUtf8Bytes: number): string[] {
  if (maxUtf8Bytes <= 0 || !text) {
    return [text];
  }
  if (Buffer.byteLength(text, "utf8") <= maxUtf8Bytes) {
    return [text];
  }

  const parts: string[] = [];
  let cursor = 0;
  while (cursor < text.length) {
    // The number of UTF-16 code units is always <= the number of UTF-8 bytes,
    // so `cursor + maxUtf8Bytes` is a safe upper bound on the next split point.
    let low = cursor + 1;
    let high = Math.min(text.length, cursor + maxUtf8Bytes);
    let best = cursor;

    // Binary search for the longest slice starting at `cursor` that fits.
    // Probed slices are never empty because `mid >= cursor + 1`.
    while (low <= high) {
      const mid = Math.floor((low + high) / 2);
      const bytes = Buffer.byteLength(text.slice(cursor, mid), "utf8");
      if (bytes <= maxUtf8Bytes) {
        best = mid;
        low = mid + 1;
      } else {
        high = mid - 1;
      }
    }

    // Nothing fits (one code unit already exceeds the budget): force progress.
    if (best <= cursor) {
      best = Math.min(text.length, cursor + 1);
    }

    // Avoid splitting inside a surrogate pair.
    if (best < text.length) {
      const before = text.charCodeAt(best - 1);
      const after = text.charCodeAt(best);
      const splitsPair =
        before >= 0xd800 && before <= 0xdbff && after >= 0xdc00 && after <= 0xdfff;
      if (splitsPair) {
        // Normally back off to just before the pair. If the pair is the first
        // thing in this part, backing off would make no progress (and used to
        // abort the loop, dropping the rest of the text), so take the whole
        // pair as an oversized part instead.
        best = best - 1 > cursor ? best - 1 : best + 1;
      }
    }

    parts.push(text.slice(cursor, best));
    cursor = best;
  }
  return parts;
}