import streamlit as st
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from typing import Callable, List, Tuple


class DocumentChunker:
    def __init__(
        self,
        chunk_size: int = 1000,
        chunk_overlap: int = 200,
        length_function: Callable[[str], int] = len,
        max_tokens_per_chunk: int = 2000
    ):
        """Initialize the document chunker with configurable parameters.

        Args:
            chunk_size: The target size of each text chunk
            chunk_overlap: The number of characters that overlap between chunks
            length_function: Function to measure text length (default: character count)
            max_tokens_per_chunk: Maximum number of tokens allowed per chunk
        """
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=length_function,
            separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""]
        )
        self.max_tokens_per_chunk = max_tokens_per_chunk

    def chunk_document(self, content: str, metadata: dict) -> List[Document]:
        """Split document content into chunks with metadata.

        Args:
            content: The document text content
            metadata: Dictionary containing document metadata (e.g., source, page numbers)

        Returns:
            List of langchain Document objects carrying chunk text and metadata
        """
        try:
            # Split the text into chunks
            chunks = self.text_splitter.create_documents(
                texts=[content],
                metadatas=[metadata]
            )

            # Enforce the token budget on each chunk
            processed_chunks = []
            for i, chunk in enumerate(chunks):
                # Estimate tokens (rough approximation: 4 chars ≈ 1 token)
                estimated_tokens = len(chunk.page_content) // 4
                if estimated_tokens > self.max_tokens_per_chunk:
                    # Re-split oversized chunks with a splitter sized to the
                    # token budget; reusing self.text_splitter (same
                    # chunk_size) would return the chunk unchanged.
                    sub_splitter = RecursiveCharacterTextSplitter(
                        chunk_size=self.max_tokens_per_chunk * 4,
                        chunk_overlap=0,
                        separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""]
                    )
                    sub_chunks = sub_splitter.create_documents(
                        texts=[chunk.page_content],
                        metadatas=[{**chunk.metadata, 'sub_chunk': i}]
                    )
                    processed_chunks.extend(sub_chunks)
                else:
                    processed_chunks.append(chunk)

            return processed_chunks
        except Exception as e:
            st.error(f"Error chunking document: {e}")
            return []

    def process_documents(
        self,
        documents: List[Tuple[str, str]]
    ) -> Tuple[List[str], List[dict]]:
        """Process multiple documents and their metadata.

        Args:
            documents: List of tuples containing (content, filename)

        Returns:
            Tuple containing (list of chunk contents, list of chunk metadata)
        """
        all_chunks = []
        all_metadatas = []

        for content, filename in documents:
            metadata = {
                'source': filename,
                'chunk_index': 0  # Updated per chunk below
            }

            chunks = self.chunk_document(content, metadata)

            for i, chunk in enumerate(chunks):
                chunk.metadata['chunk_index'] = i
                all_chunks.append(chunk.page_content)
                all_metadatas.append(chunk.metadata)

        return all_chunks, all_metadatas
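

# --- Usage sketch (not part of the chunker itself) ---
# A minimal demonstration of DocumentChunker against two in-memory
# documents. The sample text and filenames below are invented for
# illustration; in the real app the content would come from file
# uploads or another loader. Run directly (`python` on this file)
# rather than via `streamlit run`, since no UI elements are created.
if __name__ == "__main__":
    chunker = DocumentChunker(chunk_size=200, chunk_overlap=40)

    sample_docs = [
        ("Retrieval-augmented generation splits source documents into "
         "overlapping chunks so each piece fits a model's context window. "
         "The overlap preserves sentences that straddle chunk boundaries.",
         "rag_notes.txt"),
        ("Chunk metadata records where each fragment came from, which "
         "lets the app cite the original file when answering questions.",
         "metadata_notes.txt"),
    ]

    chunks, metadatas = chunker.process_documents(sample_docs)
    for text, meta in zip(chunks, metadatas):
        print(f"[{meta['source']} #{meta['chunk_index']}] {text[:60]}...")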