Spaces:

vn6295337
/

RAG-document-assistant

Sleeping

App Files Files Community

RAG-document-assistant / frontend /src /api /chunker.js

vn6295337's picture

Initial commit: RAG Document Assistant with Zero-Storage Privacy

f866820 about 1 month ago

history blame contribute delete

2.27 kB

	/**
	* Client-side text chunking utilities
	* Chunks text into smaller pieces for embedding
	* Tracks character positions for later re-extraction (zero-storage)
	*/

	const DEFAULT_CHUNK_SIZE = 500; // characters
	const DEFAULT_OVERLAP = 50; // characters shared by consecutive chunks

	// The sentence-boundary search window opens this many characters before the
	// hard cut (never earlier than the chunk start) ...
	const BOUNDARY_LOOKBACK = 100;
	// ... and closes this many characters after it.
	const BOUNDARY_LOOKAHEAD = 50;
	// Sentence-ending punctuation followed by at least one whitespace character.
	const SENTENCE_BOUNDARY = /[.!?]\s+/;

	/**
	 * Split text into overlapping chunks, tracking character positions.
	 *
	 * Each chunk is cut at `chunkSize` characters unless the first sentence
	 * boundary found in the window around that cut gives a usable end position,
	 * in which case the chunk ends right after the punctuation mark. The next
	 * chunk starts `overlap` characters before the previous cut.
	 *
	 * Returned positions describe the trimmed chunk text exactly:
	 * `text.slice(startChar, endChar) === chunk.text`. Chunks that contain only
	 * whitespace are omitted.
	 *
	 * @param {string} text - Text to split. Non-string or empty input yields `[]`.
	 * @param {{chunkSize?: number, overlap?: number}} [options]
	 *   `chunkSize` is the target chunk length in characters (default 500);
	 *   `overlap` is the number of characters shared by consecutive chunks
	 *   (default 50) and must be smaller than `chunkSize`.
	 * @returns {Array<{text: string, startChar: number, endChar: number}>}
	 * @throws {RangeError} If `chunkSize` is not a positive integer, or `overlap`
	 *   is not an integer in the range [0, chunkSize). Such values would otherwise
	 *   prevent the scan from advancing.
	 */
	export function chunkText(text, options = {}) {
	  const { chunkSize = DEFAULT_CHUNK_SIZE, overlap = DEFAULT_OVERLAP } = options;

	  if (typeof text !== 'string' || text.length === 0) {
	    return [];
	  }
	  if (!Number.isInteger(chunkSize) || chunkSize <= 0) {
	    throw new RangeError(
	      `chunkSize must be a positive integer, got ${String(chunkSize)}`,
	    );
	  }
	  if (!Number.isInteger(overlap) || overlap < 0 || overlap >= chunkSize) {
	    throw new RangeError(
	      `overlap must be an integer in [0, chunkSize), got ${String(overlap)}`,
	    );
	  }

	  const chunks = [];
	  let start = 0;

	  while (start < text.length) {
	    let end = Math.min(start + chunkSize, text.length);

	    // Try to break at a sentence boundary instead of mid-sentence.
	    if (end < text.length) {
	      const searchStart = Math.max(start + chunkSize - BOUNDARY_LOOKBACK, start);
	      const searchText = text.slice(searchStart, end + BOUNDARY_LOOKAHEAD);
	      const sentenceEnd = searchText.search(SENTENCE_BOUNDARY);
	      if (sentenceEnd > 0) {
	        const candidateEnd = searchStart + sentenceEnd + 1;
	        // Accept the boundary only if the next start still moves forward;
	        // otherwise keep the hard cut, which always advances because
	        // overlap < chunkSize.
	        if (candidateEnd - overlap > start) {
	          end = candidateEnd;
	        }
	      }
	    }

	    const rawSlice = text.slice(start, end);
	    const leadingWhitespace = rawSlice.search(/\S/);
	    if (leadingWhitespace !== -1) {
	      const trimmed = rawSlice.trim();
	      // Report offsets of the trimmed text so re-extraction reproduces it.
	      const startChar = start + leadingWhitespace;
	      chunks.push({
	        text: trimmed,
	        startChar,
	        endChar: startChar + trimmed.length,
	      });
	    }

	    // The chunk just emitted reached the end of the text: nothing left.
	    if (end >= text.length) break;

	    // Move start back by the overlap.
	    start = end - overlap;
	  }

	  return chunks;
	}

	/**
	 * Chunk every file's content and flatten the results into one list ready
	 * for embedding.
	 *
	 * Each entry carries the chunk text plus metadata identifying the source
	 * file (name, id, path) and the chunk's index, the file's chunk count, and
	 * the character positions reported by `chunkText`.
	 *
	 * @param {Iterable<{content: string, name: *, id: *, path: *}>} files
	 * @param {object} [options] - Passed through unchanged to `chunkText`.
	 * @returns {Array<{text: string, metadata: object}>}
	 */
	export function chunkFiles(files, options = {}) {
	  const allChunks = [];

	  for (const file of files) {
	    const pieces = chunkText(file.content, options);
	    const totalChunks = pieces.length;

	    for (let chunkIndex = 0; chunkIndex < totalChunks; chunkIndex += 1) {
	      const piece = pieces[chunkIndex];
	      const metadata = {
	        filename: file.name,
	        fileId: file.id,
	        filePath: file.path, // Dropbox path for re-fetching
	        chunkIndex,
	        totalChunks,
	        startChar: piece.startChar,
	        endChar: piece.endChar,
	      };
	      allChunks.push({ text: piece.text, metadata });
	    }
	  }

	  return allChunks;
	}

	/**
	 * Roughly estimate how many tokens a piece of text contains.
	 *
	 * Uses the rule of thumb of about four characters per token for English
	 * and rounds up.
	 *
	 * @param {string} text
	 * @returns {number} Estimated token count.
	 */
	export function estimateTokens(text) {
	  const charsPerToken = 4; // rough average for English
	  const charCount = text.length;
	  return Math.ceil(charCount / charsPerToken);
	}