RFP_Analyzer_Agent

Paused

App Files Files Community

cryogenic22 committed on Dec 5, 2024

Commit

354813e

verified ·

1 Parent(s): 09e8d84

Create document_chunker.py

Browse files

Files changed (1) hide show

utils/document_chunker.py +96 -0

utils/document_chunker.py ADDED Viewed

	@@ -0,0 +1,96 @@

+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from typing import List, Tuple
+import streamlit as st
class DocumentChunker:
    """Split document text into overlapping chunks and cap their estimated size.

    Text is first split by ``self.text_splitter`` using the caller's settings.
    Any resulting chunk whose estimated token count (characters divided by
    ``_CHARS_PER_TOKEN``) exceeds ``max_tokens_per_chunk`` is split again by a
    second, character-based splitter whose chunk size is derived from that
    token limit. Errors raised while chunking are reported through
    ``st.error`` and yield an empty result instead of propagating.
    """

    # Rough heuristic used for every token estimate in this class.
    _CHARS_PER_TOKEN = 4

    # Separator list handed to both splitters, from paragraph breaks down to
    # the empty string.
    _SEPARATORS = ["\n\n", "\n", ".", "!", "?", ",", " ", ""]

    def __init__(
        self,
        chunk_size: int = 1000,
        chunk_overlap: int = 200,
        length_function: callable = len,
        max_tokens_per_chunk: int = 2000
    ):
        """Initialize the document chunker with configurable parameters.

        Args:
            chunk_size: The target size of each text chunk, measured with
                ``length_function``.
            chunk_overlap: The amount of overlap between consecutive chunks.
            length_function: Function to measure text length
                (default: character count).
            max_tokens_per_chunk: Maximum estimated number of tokens allowed
                per chunk. The overflow splitter is sized from this value at
                construction time.
        """
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=length_function,
            separators=list(self._SEPARATORS),
        )
        self.max_tokens_per_chunk = max_tokens_per_chunk

        # Re-splitting an oversized chunk with ``text_splitter`` itself cannot
        # shrink it (same size limit), so a dedicated splitter enforces the
        # token cap. It always measures in characters because the token
        # estimate in ``chunk_document`` is character-based as well.
        overflow_chars = max(1, max_tokens_per_chunk * self._CHARS_PER_TOKEN)
        self._overflow_splitter = RecursiveCharacterTextSplitter(
            chunk_size=overflow_chars,
            # Keep the overlap well below the chunk size so the configuration
            # stays valid even for very small token limits.
            chunk_overlap=min(chunk_overlap, overflow_chars // 2),
            length_function=len,
            separators=list(self._SEPARATORS),
        )

    def chunk_document(self, content: str, metadata: dict) -> list:
        """Split document content into chunks with metadata.

        Args:
            content: The document text content.
            metadata: Dictionary containing document metadata
                (e.g., source, page numbers); attached to every chunk.

        Returns:
            List of chunk objects as produced by the splitter's
            ``create_documents``; each exposes ``page_content`` and
            ``metadata``. Pieces produced by the overflow split additionally
            carry a ``'sub_chunk'`` metadata entry holding the index of the
            oversized chunk they came from. An empty list is returned when
            chunking fails.
        """
        try:
            chunks = self.text_splitter.create_documents(
                texts=[content],
                metadatas=[metadata]
            )

            processed_chunks = []
            for i, chunk in enumerate(chunks):
                # Estimate tokens (rough approximation: 4 chars ≈ 1 token)
                estimated_tokens = len(chunk.page_content) // self._CHARS_PER_TOKEN
                if estimated_tokens <= self.max_tokens_per_chunk:
                    processed_chunks.append(chunk)
                    continue

                # Chunk is over budget: split it again with the splitter that
                # is sized from the token limit.
                processed_chunks.extend(
                    self._overflow_splitter.create_documents(
                        texts=[chunk.page_content],
                        metadatas=[{**chunk.metadata, 'sub_chunk': i}]
                    )
                )
            return processed_chunks
        except Exception as e:
            # UI boundary: surface the problem to the user and keep the app
            # running rather than aborting the whole batch.
            st.error(f"Error chunking document: {str(e)}")
            return []

    def process_documents(
        self,
        documents: List[Tuple[str, str]]
    ) -> Tuple[List[str], List[dict]]:
        """Process multiple documents and their metadata.

        Args:
            documents: List of tuples containing (content, filename).

        Returns:
            Tuple containing (list of chunk contents, list of chunk metadata).
            The two lists are index-aligned. Each metadata dict has
            ``'source'`` set to the filename and ``'chunk_index'`` set to the
            chunk's position within its own document.
        """
        all_chunks = []
        all_metadatas = []

        for content, filename in documents:
            metadata = {
                'source': filename,
                'chunk_index': 0  # Placeholder; overwritten per chunk below
            }
            chunks = self.chunk_document(content, metadata)

            for i, chunk in enumerate(chunks):
                chunk.metadata['chunk_index'] = i
                all_chunks.append(chunk.page_content)
                all_metadatas.append(chunk.metadata)

        return all_chunks, all_metadatas