Spaces:
Paused
Paused
File size: 3,611 Bytes
354813e 2515ab9 354813e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 |
from typing import Any, Callable, Dict, List, Tuple

import streamlit as st
from langchain.text_splitter import RecursiveCharacterTextSplitter
class DocumentChunker:
    """Split documents into text chunks that stay within a token budget.

    Text is first split by a ``RecursiveCharacterTextSplitter`` configured
    from the constructor arguments. Any resulting chunk whose estimated
    token count exceeds ``max_tokens_per_chunk`` is split again by a second
    splitter whose chunk size is derived from that token budget.
    """

    # Rough heuristic used for token estimation: 4 characters ~ 1 token.
    _CHARS_PER_TOKEN = 4
    # Separator list handed to both splitters, coarsest first.
    _SEPARATORS = ("\n\n", "\n", ".", "!", "?", ",", " ", "")

    def __init__(
        self,
        chunk_size: int = 1000,
        chunk_overlap: int = 200,
        length_function: Callable[[str], int] = len,
        max_tokens_per_chunk: int = 2000
    ):
        """Initialize the document chunker with configurable parameters.

        Args:
            chunk_size: The target size of each text chunk, measured with
                ``length_function``.
            chunk_overlap: The amount of overlap between consecutive chunks.
            length_function: Function to measure text length
                (default: character count).
            max_tokens_per_chunk: Maximum number of (estimated) tokens
                allowed per chunk.
        """
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=length_function,
            separators=list(self._SEPARATORS)
        )
        self.max_tokens_per_chunk = max_tokens_per_chunk

        # Fallback splitter for chunks that exceed the token budget.
        # Re-running the primary splitter on one of its own chunks returns
        # that chunk unchanged, so the budget needs a splitter sized from
        # the token limit itself. It always measures in characters because
        # the token estimate is character-based.
        max_chars = max(1, int(max_tokens_per_chunk) * self._CHARS_PER_TOKEN)
        self._oversize_splitter = RecursiveCharacterTextSplitter(
            chunk_size=max_chars,
            # Keep the overlap strictly smaller than the fallback chunk size.
            chunk_overlap=min(chunk_overlap, max_chars // 2),
            length_function=len,
            separators=list(self._SEPARATORS)
        )

    def _estimate_tokens(self, text: str) -> int:
        """Return a rough token estimate for *text* from its character count."""
        return len(text) // self._CHARS_PER_TOKEN

    def chunk_document(self, content: str, metadata: dict) -> List[Any]:
        """Split document content into chunks with metadata.

        Args:
            content: The document text content.
            metadata: Dictionary containing document metadata
                (e.g., source, page numbers).

        Returns:
            List of document objects as produced by the splitter's
            ``create_documents``; each exposes ``page_content`` and
            ``metadata``. Chunks created by the fallback split carry an
            extra ``'sub_chunk'`` metadata key holding the index of the
            oversized chunk they came from. An empty list is returned if
            chunking fails.
        """
        try:
            # Split the text into chunks.
            chunks = self.text_splitter.create_documents(
                texts=[content],
                metadatas=[metadata]
            )

            # Enforce the token budget on every chunk.
            processed_chunks = []
            for i, chunk in enumerate(chunks):
                if self._estimate_tokens(chunk.page_content) <= self.max_tokens_per_chunk:
                    processed_chunks.append(chunk)
                    continue

                # Chunk is too large: split it again with the splitter that
                # is sized from the token budget.
                sub_chunks = self._oversize_splitter.create_documents(
                    texts=[chunk.page_content],
                    metadatas=[{**chunk.metadata, 'sub_chunk': i}]
                )
                processed_chunks.extend(sub_chunks)
            return processed_chunks
        except Exception as e:
            # UI boundary: report the failure in the Streamlit app and let
            # the caller continue with no chunks instead of crashing.
            st.error(f"Error chunking document: {str(e)}")
            return []

    def process_documents(
        self,
        documents: List[Tuple[str, str]]
    ) -> Tuple[List[str], List[dict]]:
        """Process multiple documents and their metadata.

        Args:
            documents: List of tuples containing (content, filename).

        Returns:
            Tuple containing (list of chunk contents, list of chunk
            metadata). The two lists are index-aligned; each metadata dict
            holds the ``'source'`` filename and the chunk's position within
            its document under ``'chunk_index'``.
        """
        all_chunks = []
        all_metadatas = []
        for content, filename in documents:
            metadata = {
                'source': filename,
                'chunk_index': 0  # Placeholder; overwritten per chunk below
            }
            chunks = self.chunk_document(content, metadata)
            for i, chunk in enumerate(chunks):
                chunk.metadata['chunk_index'] = i
                all_chunks.append(chunk.page_content)
                all_metadatas.append(chunk.metadata)
        return all_chunks, all_metadatas