# NOTE: removed non-code page residue ("Spaces: / Paused / Paused") left over
# from the hosting page this file was extracted from.
import streamlit as st

from langchain.text_splitter import RecursiveCharacterTextSplitter
from typing import Callable, Dict, List, Tuple
class DocumentChunker:
    """Split raw document text into overlapping chunks for downstream retrieval.

    Wraps LangChain's ``RecursiveCharacterTextSplitter`` and additionally
    enforces a soft per-chunk token ceiling, estimated at roughly 4 characters
    per token.
    """

    # Rough heuristic used to estimate token counts from character counts.
    _CHARS_PER_TOKEN = 4

    def __init__(
        self,
        chunk_size: int = 1000,
        chunk_overlap: int = 200,
        length_function: Callable[[str], int] = len,
        max_tokens_per_chunk: int = 2000,
    ):
        """Initialize the document chunker with configurable parameters.

        Args:
            chunk_size: The target size of each text chunk (in units measured
                by ``length_function``, characters by default).
            chunk_overlap: The number of characters that overlap between chunks.
            length_function: Function to measure text length (default: character count).
            max_tokens_per_chunk: Maximum number of (estimated) tokens allowed per chunk.
        """
        # Separator preference order: paragraph, line, sentence, clause, word, char.
        self._separators = ["\n\n", "\n", ".", "!", "?", ",", " ", ""]
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=length_function,
            separators=self._separators,
        )
        self.max_tokens_per_chunk = max_tokens_per_chunk

    def chunk_document(self, content: str, metadata: dict) -> List[dict]:
        """Split document content into chunks with metadata.

        Args:
            content: The document text content.
            metadata: Dictionary containing document metadata (e.g., source,
                page numbers). Each produced chunk carries a copy of it.

        Returns:
            List of LangChain ``Document`` objects (``.page_content`` /
            ``.metadata``); an empty list if splitting fails.
        """
        try:
            chunks = self.text_splitter.create_documents(
                texts=[content],
                metadatas=[metadata],
            )
            # Enforce the token ceiling on each produced chunk.
            processed_chunks = []
            for i, chunk in enumerate(chunks):
                # Rough approximation: 4 chars ≈ 1 token.
                estimated_tokens = len(chunk.page_content) // self._CHARS_PER_TOKEN
                if estimated_tokens > self.max_tokens_per_chunk:
                    # BUG FIX: the original re-split oversized chunks with the
                    # *same* splitter (same chunk_size), which is a no-op on
                    # text that splitter just produced. Use a finer splitter
                    # whose character budget matches the token ceiling.
                    sub_splitter = RecursiveCharacterTextSplitter(
                        chunk_size=self.max_tokens_per_chunk * self._CHARS_PER_TOKEN,
                        chunk_overlap=0,
                        separators=self._separators,
                    )
                    sub_chunks = sub_splitter.create_documents(
                        texts=[chunk.page_content],
                        metadatas=[{**chunk.metadata, 'sub_chunk': i}],
                    )
                    processed_chunks.extend(sub_chunks)
                else:
                    processed_chunks.append(chunk)
            return processed_chunks
        except Exception as e:
            # Boundary handler for the Streamlit UI: surface the error to the
            # user and degrade to "no chunks" rather than crashing the app.
            st.error(f"Error chunking document: {str(e)}")
            return []

    def process_documents(
        self,
        documents: List[Tuple[str, str]],
    ) -> Tuple[List[str], List[dict]]:
        """Process multiple documents and their metadata.

        Args:
            documents: List of tuples containing (content, filename).

        Returns:
            Tuple containing (list of chunk contents, list of chunk metadata),
            index-aligned; each metadata dict gets a per-document
            ``chunk_index`` and a ``source`` set to the filename.
        """
        all_chunks: List[str] = []
        all_metadatas: List[dict] = []
        for content, filename in documents:
            metadata = {
                'source': filename,
                'chunk_index': 0,  # placeholder; overwritten per chunk below
            }
            chunks = self.chunk_document(content, metadata)
            for i, chunk in enumerate(chunks):
                chunk.metadata['chunk_index'] = i
                all_chunks.append(chunk.page_content)
                all_metadatas.append(chunk.metadata)
        return all_chunks, all_metadatas