Spaces:
Sleeping
Sleeping
| /** | |
| * Client-side text chunking utilities | |
| * Chunks text into smaller pieces for embedding | |
| * Tracks character positions for later re-extraction (zero-storage) | |
| */ | |
| const DEFAULT_CHUNK_SIZE = 500; // characters | |
| const DEFAULT_OVERLAP = 50; | |
| /** | |
| * Split text into chunks with overlap, tracking positions | |
| * Returns chunks with start/end character positions for re-extraction | |
| */ | |
| export function chunkText(text, options = {}) { | |
| const { | |
| chunkSize = DEFAULT_CHUNK_SIZE, | |
| overlap = DEFAULT_OVERLAP, | |
| } = options; | |
| if (!text || text.length === 0) { | |
| return []; | |
| } | |
| const chunks = []; | |
| let start = 0; | |
| while (start < text.length) { | |
| let end = Math.min(start + chunkSize, text.length); | |
| // Try to break at sentence boundary | |
| if (end < text.length) { | |
| const searchStart = Math.max(start + chunkSize - 100, start); | |
| const searchText = text.slice(searchStart, end + 50); | |
| const sentenceEnd = searchText.search(/[.!?]\s+/); | |
| if (sentenceEnd > 0) { | |
| end = searchStart + sentenceEnd + 1; | |
| } | |
| } | |
| const chunkText = text.slice(start, end).trim(); | |
| if (chunkText.length > 0) { | |
| chunks.push({ | |
| text: chunkText, | |
| startChar: start, | |
| endChar: end, | |
| }); | |
| } | |
| // Move start with overlap | |
| start = end - overlap; | |
| if (start >= text.length - overlap) break; | |
| } | |
| return chunks; | |
| } | |
| /** | |
| * Chunk multiple files and prepare for embedding | |
| * Includes file path and character positions for re-fetching | |
| */ | |
| export function chunkFiles(files, options = {}) { | |
| const allChunks = []; | |
| for (const file of files) { | |
| const chunks = chunkText(file.content, options); | |
| chunks.forEach((chunk, index) => { | |
| allChunks.push({ | |
| text: chunk.text, | |
| metadata: { | |
| filename: file.name, | |
| fileId: file.id, | |
| filePath: file.path, // Dropbox path for re-fetching | |
| chunkIndex: index, | |
| totalChunks: chunks.length, | |
| startChar: chunk.startChar, | |
| endChar: chunk.endChar, | |
| }, | |
| }); | |
| }); | |
| } | |
| return allChunks; | |
| } | |
| /** | |
| * Estimate token count (rough approximation) | |
| */ | |
| export function estimateTokens(text) { | |
| // Rough estimate: ~4 characters per token for English | |
| return Math.ceil(text.length / 4); | |
| } | |