Spaces:

vn6295337
/

RAG-document-assistant

Running

File size: 2,266 Bytes

f866820

/**
 * Client-side text chunking utilities
 * Chunks text into smaller pieces for embedding
 * Tracks character positions for later re-extraction (zero-storage)
 */

// Default target length of a chunk, in characters; used when options.chunkSize is omitted.
const DEFAULT_CHUNK_SIZE = 500; // characters
// Default number of characters by which the next chunk's start backs up from
// the previous chunk's end; used when options.overlap is omitted.
const DEFAULT_OVERLAP = 50;

/**
 * Split text into overlapping chunks, tracking character positions.
 *
 * Each chunk nominally spans `chunkSize` characters. When a chunk would stop
 * before the end of the text, its end is moved to just after the first
 * sentence terminator (`.`, `!` or `?` followed by whitespace) found in a
 * window reaching from 100 characters before to 50 characters after the
 * nominal end. The next chunk then starts `overlap` characters before the
 * previous chunk's end.
 *
 * `startChar`/`endChar` describe the untrimmed span, so
 * `text.slice(startChar, endChar).trim()` reproduces the chunk's `text`.
 * Spans that are empty after trimming are skipped.
 *
 * @param {string} text - Text to split. Falsy or empty input yields `[]`.
 * @param {object} [options]
 * @param {number} [options.chunkSize=DEFAULT_CHUNK_SIZE] - Nominal chunk
 *   length in characters; must be a positive integer.
 * @param {number} [options.overlap=DEFAULT_OVERLAP] - Characters shared with
 *   the previous chunk; must be a non-negative integer.
 * @returns {Array<{text: string, startChar: number, endChar: number}>}
 * @throws {RangeError} If `chunkSize` or `overlap` is out of range for
 *   non-empty text.
 */
export function chunkText(text, options = {}) {
  const {
    chunkSize = DEFAULT_CHUNK_SIZE,
    overlap = DEFAULT_OVERLAP,
  } = options;

  if (!text || text.length === 0) {
    return [];
  }

  // Reject options that previously caused an endless loop (chunkSize <= 0),
  // silent truncation (NaN) or silently skipped text (negative overlap).
  if (!Number.isInteger(chunkSize) || chunkSize < 1) {
    throw new RangeError(`chunkSize must be a positive integer, got ${String(chunkSize)}`);
  }
  if (!Number.isInteger(overlap) || overlap < 0) {
    throw new RangeError(`overlap must be a non-negative integer, got ${String(overlap)}`);
  }

  const sentenceBoundary = /[.!?]\s+/;
  const lookBehind = 100; // how far before the nominal end the boundary search starts
  const lookAhead = 50; // how far past the nominal end the boundary search may reach

  const chunks = [];
  let start = 0;

  while (start < text.length) {
    let end = Math.min(start + chunkSize, text.length);

    // Try to break at sentence boundary
    if (end < text.length) {
      const searchStart = Math.max(start + chunkSize - lookBehind, start);
      const searchWindow = text.slice(searchStart, end + lookAhead);
      const boundaryIndex = searchWindow.search(sentenceBoundary);
      if (boundaryIndex > 0) {
        // +1 keeps the terminator itself inside the chunk.
        end = searchStart + boundaryIndex + 1;
      }
    }

    const piece = text.slice(start, end).trim();
    if (piece.length > 0) {
      chunks.push({
        text: piece,
        startChar: start,
        endChar: end,
      });
    }

    // The chunk that reaches the end of the text is the last one.
    if (end >= text.length) break;

    // Move start with overlap, but always forward: if backing up by `overlap`
    // would not advance past the current start (overlap >= chunk length, or an
    // early sentence break), continue from `end` without overlap instead of
    // looping forever.
    const nextStart = end - overlap;
    start = nextStart > start ? nextStart : end;
  }

  return chunks;
}

/**
 * Chunk the content of every file and flatten the results into one list.
 * Each entry pairs the chunk text with metadata naming its source file and
 * the character range it covers, so the text can be re-fetched later.
 *
 * @param {Iterable<{name: *, id: *, path: *, content: string}>} files - Files to chunk.
 * @param {object} [options] - Passed through unchanged to chunkText.
 * @returns {Array<{text: string, metadata: object}>} Chunks of all files, in file order.
 */
export function chunkFiles(files, options = {}) {
  const collected = [];

  for (const file of files) {
    const pieces = chunkText(file.content, options);
    const totalChunks = pieces.length;

    for (const [chunkIndex, piece] of pieces.entries()) {
      const metadata = {
        filename: file.name,
        fileId: file.id,
        filePath: file.path, // Dropbox path for re-fetching
        chunkIndex,
        totalChunks,
        startChar: piece.startChar,
        endChar: piece.endChar,
      };
      collected.push({ text: piece.text, metadata });
    }
  }

  return collected;
}

/**
 * Estimate token count (rough approximation).
 *
 * Uses the rule of thumb of roughly 4 characters per token for English text
 * and rounds up. Missing text (`null`/`undefined`) counts as 0 tokens, in line
 * with chunkText treating missing text as empty.
 *
 * @param {string|null|undefined} text - Text to measure.
 * @returns {number} Estimated number of tokens.
 */
export function estimateTokens(text) {
  // `== null` matches both null and undefined, which would otherwise throw on `.length`.
  if (text == null) {
    return 0;
  }

  // Rough estimate: ~4 characters per token for English
  const charsPerToken = 4;
  return Math.ceil(text.length / charsPerToken);
}